diff --git a/.clang-format b/.clang-format deleted file mode 100644 index 45232b80e..000000000 --- a/.clang-format +++ /dev/null @@ -1,161 +0,0 @@ ---- -Language: Cpp -AlignAfterOpenBracket: Align -AlignArrayOfStructures: Left -AlignConsecutiveAssignments: AcrossComments -AlignConsecutiveBitFields: AcrossComments -AlignConsecutiveDeclarations: AcrossComments -AlignConsecutiveMacros: AcrossComments -# AlignConsecutiveShortCaseStatements: AcrossComments -AlignEscapedNewlines: Left # LeftWithLastLine -AlignOperands: Align -AlignTrailingComments: - Kind: Always - OverEmptyLines: 1 -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: false -# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: Inline -AllowShortIfStatementsOnASingleLine: Never -AllowShortLambdasOnASingleLine: Inline -AllowShortLoopsOnASingleLine: false -AlwaysBreakBeforeMultilineStrings: true -BinPackArguments: true -BinPackParameters: true # OnePerLine -BitFieldColonSpacing: Both -BreakBeforeBraces: Custom # Attach -BraceWrapping: - AfterCaseLabel: true - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false -# BreakAdjacentStringLiterals: true -BreakAfterAttributes: Never -BreakBeforeBinaryOperators: None -BreakBeforeInlineASMColon: OnlyMultiline -BreakBeforeTernaryOperators: false -# BreakBinaryOperations: Never -BreakConstructorInitializers: AfterColon -# BreakFunctionDefinitionParameters: false -BreakInheritanceList: AfterComma -BreakStringLiterals: true -# BreakTemplateDeclarations: Yes -ColumnLimit: 120 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: false -DerivePointerAlignment: false -DisableFormat: false -EmptyLineBeforeAccessModifier: Leave -EmptyLineAfterAccessModifier: Never -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^<.*\.h>' - Priority: 1 - SortPriority: 0 - - Regex: '^<.*' - Priority: 2 - SortPriority: 0 - - Regex: '.*' - Priority: 3 - SortPriority: 0 -IncludeIsMainRegex: '([-_](test|unittest))?$' -IncludeIsMainSourceRegex: '' -IndentAccessModifiers: false -IndentCaseBlocks: true -IndentCaseLabels: true -IndentExternBlock: NoIndent -IndentGotoLabels: false -IndentPPDirectives: AfterHash -IndentWidth: 4 -IndentWrappedFunctionNames: false -InsertBraces: true # NOTE: may lead to incorrect formatting -InsertNewlineAtEOF: true -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -LambdaBodyIndentation: Signature -LineEnding: LF -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Auto -ObjCBlockIndentWidth: 4 -ObjCSpaceAfterProperty: true -ObjCSpaceBeforeProtocolList: true -PPIndentWidth: -1 -PackConstructorInitializers: CurrentLine -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 
-PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Middle -QualifierAlignment: Left -#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict'] -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' -ReferenceAlignment: Middle -ReflowComments: false # IndentOnly -SeparateDefinitionBlocks: Always -SortIncludes: CaseInsensitive -SortUsingDeclarations: LexicographicNumeric -SpaceAfterCStyleCast: true -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyBlock: false -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: Never -SpacesInContainerLiterals: true -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: -1 -SpacesInParentheses: false -SpacesInSquareBrackets: false -SpaceBeforeSquareBrackets: false -Standard: c++17 -TabWidth: 4 -UseTab: Never -WhitespaceSensitiveMacros: ['STRINGIZE'] -... - diff --git a/.clang-tidy b/.clang-tidy index 310c3d182..952c0cca8 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -17,10 +17,8 @@ Checks: > -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, performance-*, portability-*, - -portability-simd-intrinsics, misc-*, -misc-const-correctness, -misc-non-private-member-variables-in-classes, -misc-no-recursion, - -misc-use-anonymous-namespace, FormatStyle: none diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile deleted file mode 100644 index 522ee8147..000000000 --- a/.devops/cpu.Dockerfile +++ /dev/null @@ -1,92 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -FROM ubuntu:$UBUNTU_VERSION AS build - -ARG TARGETARCH - -ARG GGML_CPU_ARM_ARCH=armv8-a - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN if [ "$TARGETARCH" = "amd64" ]; then \ - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \ - elif [ "$TARGETARCH" = "arm64" ]; then \ - cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \ - else \ - echo "Unsupported architecture"; \ - exit 1; \ - fi && \ - cmake --build build -j $(nproc) - -RUN mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -RUN mkdir -p /app/full \ - && cp build/bin/* /app/full \ - && cp *.py /app/full \ - && cp -r gguf-py /app/full \ - && cp -r requirements /app/full \ - && cp requirements.txt /app/full \ - && cp .devops/tools.sh /app/full/tools.sh - -## Base image -FROM ubuntu:$UBUNTU_VERSION AS base - -RUN apt-get update \ - && apt-get install -y libgomp1 curl\ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - -COPY --from=build /app/lib/ /app - -### Full -FROM base AS full - -COPY --from=build /app/full /app - -WORKDIR /app - -RUN apt-get update \ - && apt-get install -y \ - git \ - python3 \ - python3-pip \ - && pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - -ENTRYPOINT ["/app/tools.sh"] - -### Light, CLI only -FROM base AS light - -COPY --from=build /app/full/llama-cli /app - -WORKDIR /app - -ENTRYPOINT [ "/app/llama-cli" ] - -### Server, Server only -FROM base AS server - -ENV LLAMA_ARG_HOST=0.0.0.0 - -COPY --from=build /app/full/llama-server /app - -WORKDIR /app - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile deleted file mode 100644 index 974dd78a8..000000000 --- a/.devops/cuda.Dockerfile +++ /dev/null @@ -1,94 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG CUDA_VERSION=12.6.0 -# Target the CUDA build image -ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} - -ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_CUDA_DEV_CONTAINER} AS build - -# CUDA architecture to build for (defaults to all supported archs) -ARG CUDA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 - -WORKDIR /app - -COPY . . - -RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . 
&& \ - cmake --build build --config Release -j$(nproc) - -RUN mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -RUN mkdir -p /app/full \ - && cp build/bin/* /app/full \ - && cp *.py /app/full \ - && cp -r gguf-py /app/full \ - && cp -r requirements /app/full \ - && cp requirements.txt /app/full \ - && cp .devops/tools.sh /app/full/tools.sh - -## Base image -FROM ${BASE_CUDA_RUN_CONTAINER} AS base - -RUN apt-get update \ - && apt-get install -y libgomp1 curl\ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - -COPY --from=build /app/lib/ /app - -### Full -FROM base AS full - -COPY --from=build /app/full /app - -WORKDIR /app - -RUN apt-get update \ - && apt-get install -y \ - git \ - python3 \ - python3-pip \ - && pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - - -ENTRYPOINT ["/app/tools.sh"] - -### Light, CLI only -FROM base AS light - -COPY --from=build /app/full/llama-cli /app - -WORKDIR /app - -ENTRYPOINT [ "/app/llama-cli" ] - -### Server, Server only -FROM base AS server - -ENV LLAMA_ARG_HOST=0.0.0.0 - -COPY --from=build /app/full/llama-server /app - -WORKDIR /app - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile new file mode 100644 index 000000000..61f671465 --- /dev/null +++ b/.devops/full-cuda.Dockerfile @@ -0,0 +1,36 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 + +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} AS build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable CUDA +ENV GGML_CUDA=1 +# Enable cURL +ENV LLAMA_CURL=1 + +RUN make -j$(nproc) + +ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile new file mode 100644 index 000000000..680d1cb92 --- /dev/null +++ b/.devops/full-rocm.Dockerfile @@ -0,0 +1,50 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG ROCM_VERSION=5.6 + +# Target the CUDA build image +ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + +FROM ${BASE_ROCM_DEV_CONTAINER} AS build + +# Unless otherwise specified, we make a fat build. +# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# This is mostly tied to rocBLAS supported archs. 
+ARG ROCM_DOCKER_ARCH=\ + gfx803 \ + gfx900 \ + gfx906 \ + gfx908 \ + gfx90a \ + gfx1010 \ + gfx1030 \ + gfx1100 \ + gfx1101 \ + gfx1102 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +# Enable ROCm +ENV GGML_HIPBLAS=1 +ENV CC=/opt/rocm/llvm/bin/clang +ENV CXX=/opt/rocm/llvm/bin/clang++ + +# Enable cURL +ENV LLAMA_CURL=1 +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + +RUN make -j$(nproc) + +ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile new file mode 100644 index 000000000..2a06f82b7 --- /dev/null +++ b/.devops/full.Dockerfile @@ -0,0 +1,25 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION AS build + +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +ENV LLAMA_CURL=1 + + +RUN make -j$(nproc) + +ENV LC_ALL=C.utf8 + +ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile deleted file mode 100644 index af783f5e9..000000000 --- a/.devops/intel.Dockerfile +++ /dev/null @@ -1,91 +0,0 @@ -ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04 - -## Build Image - -FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build - -ARG GGML_SYCL_F16=OFF -RUN apt-get update && \ - apt-get install -y git libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ - echo "GGML_SYCL_F16 is set" \ - && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ - fi && \ - echo "Building with dynamic libs" && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ - cmake --build build --config Release -j$(nproc) - -RUN mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -RUN mkdir -p /app/full \ - && cp build/bin/* /app/full \ - && cp *.py /app/full \ - && cp -r gguf-py /app/full \ - && cp -r requirements /app/full \ - && cp requirements.txt /app/full \ - && cp .devops/tools.sh /app/full/tools.sh - -FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base - -RUN apt-get update \ - && apt-get install -y libgomp1 curl\ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - -### Full -FROM base AS full - -COPY --from=build /app/lib/ /app -COPY --from=build /app/full /app - -WORKDIR /app - -RUN apt-get update \ - && apt-get install -y \ - git \ - python3 \ - python3-pip \ - && pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - - -ENTRYPOINT ["/app/tools.sh"] - -### Light, CLI only -FROM base AS light - -COPY --from=build /app/lib/ /app -COPY --from=build /app/full/llama-cli /app - -WORKDIR /app - -ENTRYPOINT [ "/app/llama-cli" ] - -### Server, Server only -FROM base AS server - -ENV LLAMA_ARG_HOST=0.0.0.0 - -COPY --from=build /app/lib/ /app -COPY 
--from=build /app/full/llama-server /app - -WORKDIR /app - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/app/llama-server" ] - diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile deleted file mode 100644 index 02dce501c..000000000 --- a/.devops/llama-cli-cann.Dockerfile +++ /dev/null @@ -1,44 +0,0 @@ -ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8 - -FROM ascendai/cann:$ASCEND_VERSION AS build - -WORKDIR /app - -COPY . . - -RUN yum install -y gcc g++ cmake make -ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest -ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH -ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH} -ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH} -ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH} -ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME} -ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp -ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit -ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME} - -# find libascend_hal.so, because the drive hasn`t been mounted. -ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH - -RUN echo "Building with static libs" && \ - source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \ - cmake --build build --config Release --target llama-cli - -# TODO: use image with NNRT -FROM ascendai/cann:$ASCEND_VERSION AS runtime -COPY --from=build /app/build/bin/llama-cli /llama-cli - -ENV LC_ALL=C.utf8 - -ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest -ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH -ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH} -ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH} -ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH} -ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME} -ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp -ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit -ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME} - -ENTRYPOINT ["/llama-cli" ] diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile new file mode 100644 index 000000000..8eda63a89 --- /dev/null +++ b/.devops/llama-cli-cuda.Dockerfile @@ -0,0 +1,35 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the CUDA runtime image +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} AS build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential git + +WORKDIR /app + +COPY . . 
+ +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable CUDA +ENV GGML_CUDA=1 + +RUN make -j$(nproc) llama-cli + +FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime + +RUN apt-get update && \ + apt-get install -y libgomp1 + +COPY --from=build /app/llama-cli /llama-cli + +ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile new file mode 100644 index 000000000..2bf82bb58 --- /dev/null +++ b/.devops/llama-cli-intel.Dockerfile @@ -0,0 +1,26 @@ +ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 + +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build + +ARG GGML_SYCL_F16=OFF +RUN apt-get update && \ + apt-get install -y git + +WORKDIR /app + +COPY . . + +RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ + echo "GGML_SYCL_F16 is set" && \ + export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ + fi && \ + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ + cmake --build build --config Release --target llama-cli + +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime + +COPY --from=build /app/build/bin/llama-cli /llama-cli + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile new file mode 100644 index 000000000..c3d1ab067 --- /dev/null +++ b/.devops/llama-cli-rocm.Dockerfile @@ -0,0 +1,45 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG ROCM_VERSION=5.6 + +# Target the CUDA build image +ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + +FROM ${BASE_ROCM_DEV_CONTAINER} AS build + +# Unless otherwise specified, we make a fat build. +# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# This is mostly tied to rocBLAS supported archs. +ARG ROCM_DOCKER_ARCH=\ + gfx803 \ + gfx900 \ + gfx906 \ + gfx908 \ + gfx90a \ + gfx1010 \ + gfx1030 \ + gfx1100 \ + gfx1101 \ + gfx1102 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +# Enable ROCm +ENV GGML_HIPBLAS=1 +ENV CC=/opt/rocm/llvm/bin/clang +ENV CXX=/opt/rocm/llvm/bin/clang++ + +RUN make -j$(nproc) llama-cli + +ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile new file mode 100644 index 000000000..9b0dad8bf --- /dev/null +++ b/.devops/llama-cli-vulkan.Dockerfile @@ -0,0 +1,27 @@ +ARG UBUNTU_VERSION=jammy + +FROM ubuntu:$UBUNTU_VERSION AS build + +# Install build tools +RUN apt update && apt install -y git build-essential cmake wget libgomp1 + +# Install Vulkan SDK +RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ + apt update -y && \ + apt-get install -y vulkan-sdk + +# Build it +WORKDIR /app +COPY . . 
+RUN cmake -B build -DGGML_VULKAN=1 && \ + cmake --build build --config Release --target llama-cli + +# Clean up +WORKDIR / +RUN cp /app/build/bin/llama-cli /llama-cli && \ + rm -rf /app + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile new file mode 100644 index 000000000..7f741aa46 --- /dev/null +++ b/.devops/llama-cli.Dockerfile @@ -0,0 +1,23 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION AS build + +RUN apt-get update && \ + apt-get install -y build-essential git + +WORKDIR /app + +COPY . . + +RUN make -j$(nproc) llama-cli + +FROM ubuntu:$UBUNTU_VERSION AS runtime + +RUN apt-get update && \ + apt-get install -y libgomp1 + +COPY --from=build /app/llama-cli /llama-cli + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile new file mode 100644 index 000000000..67328cf1c --- /dev/null +++ b/.devops/llama-server-cuda.Dockerfile @@ -0,0 +1,39 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the CUDA runtime image +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} AS build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential git libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable CUDA +ENV GGML_CUDA=1 +# Enable cURL +ENV LLAMA_CURL=1 + +RUN make -j$(nproc) llama-server + +FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime + +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev libgomp1 curl + +COPY --from=build /app/llama-server /llama-server + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile new file mode 100644 index 000000000..eb9aba618 --- /dev/null +++ b/.devops/llama-server-intel.Dockerfile @@ -0,0 +1,31 @@ +ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 + +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build + +ARG GGML_SYCL_F16=OFF +RUN apt-get update && \ + apt-get install -y git libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ + echo "GGML_SYCL_F16 is set" && \ + export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ + fi && \ + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ + cmake --build build --config Release --target llama-server + +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime + +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev curl + +COPY --from=build /app/build/bin/llama-server /llama-server + +ENV LC_ALL=C.utf8 + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile new file mode 100644 index 000000000..763b4cd3f --- /dev/null +++ b/.devops/llama-server-rocm.Dockerfile @@ -0,0 +1,52 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. 
+ARG ROCM_VERSION=5.6 + +# Target the CUDA build image +ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + +FROM ${BASE_ROCM_DEV_CONTAINER} AS build + +# Unless otherwise specified, we make a fat build. +# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# This is mostly tied to rocBLAS supported archs. +ARG ROCM_DOCKER_ARCH=\ + gfx803 \ + gfx900 \ + gfx906 \ + gfx908 \ + gfx90a \ + gfx1010 \ + gfx1030 \ + gfx1100 \ + gfx1101 \ + gfx1102 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +# Enable ROCm +ENV GGML_HIPBLAS=1 +ENV CC=/opt/rocm/llvm/bin/clang +ENV CXX=/opt/rocm/llvm/bin/clang++ + +# Enable cURL +ENV LLAMA_CURL=1 +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev curl + +RUN make -j$(nproc) llama-server + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile new file mode 100644 index 000000000..13a61ffd8 --- /dev/null +++ b/.devops/llama-server-vulkan.Dockerfile @@ -0,0 +1,29 @@ +ARG UBUNTU_VERSION=jammy + +FROM ubuntu:$UBUNTU_VERSION AS build + +# Install build tools +RUN apt update && apt install -y git build-essential cmake wget + +# Install Vulkan SDK and cURL +RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ + apt update -y && \ + apt-get install -y vulkan-sdk libcurl4-openssl-dev curl + +# Build it +WORKDIR /app +COPY . . +RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ + cmake --build build --config Release --target llama-server + +# Clean up +WORKDIR / +RUN cp /app/build/bin/llama-server /llama-server && \ + rm -rf /app + +ENV LC_ALL=C.utf8 + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile new file mode 100644 index 000000000..b631d5806 --- /dev/null +++ b/.devops/llama-server.Dockerfile @@ -0,0 +1,27 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION AS build + +RUN apt-get update && \ + apt-get install -y build-essential git libcurl4-openssl-dev curl + +WORKDIR /app + +COPY . . + +ENV LLAMA_CURL=1 + +RUN make -j$(nproc) llama-server + +FROM ubuntu:$UBUNTU_VERSION AS runtime + +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev libgomp1 + +COPY --from=build /app/llama-server /llama-server + +ENV LC_ALL=C.utf8 + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile deleted file mode 100644 index bfd7fc1c1..000000000 --- a/.devops/musa.Dockerfile +++ /dev/null @@ -1,108 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. 
-ARG MUSA_VERSION=rc3.1.0 -# Target the MUSA build image -ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} - -ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_MUSA_DEV_CONTAINER} AS build - -# MUSA architecture to build for (defaults to all supported archs) -ARG MUSA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y \ - build-essential \ - cmake \ - python3 \ - python3-pip \ - git \ - libcurl4-openssl-dev \ - libgomp1 - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Use the default MUSA archs if not specified -RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release -j$(nproc) - -RUN mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -RUN mkdir -p /app/full \ - && cp build/bin/* /app/full \ - && cp *.py /app/full \ - && cp -r gguf-py /app/full \ - && cp -r requirements /app/full \ - && cp requirements.txt /app/full \ - && cp .devops/tools.sh /app/full/tools.sh - -## Base image -FROM ${BASE_MUSA_RUN_CONTAINER} AS base - -RUN apt-get update \ - && apt-get install -y libgomp1 curl\ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - -COPY --from=build /app/lib/ /app - -### Full -FROM base AS full - -COPY --from=build /app/full /app - -WORKDIR /app - -RUN apt-get update \ - && apt-get install -y \ - git \ - python3 \ - python3-pip \ - && pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - - -ENTRYPOINT ["/app/tools.sh"] - -### Light, CLI only -FROM base AS light - -COPY --from=build /app/full/llama-cli /app - -WORKDIR /app - -ENTRYPOINT [ "/app/llama-cli" ] - -### Server, Server only -FROM base AS server - -ENV LLAMA_ARG_HOST=0.0.0.0 - -COPY --from=build /app/full/llama-server /app - -WORKDIR /app - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix index 0ecf19fc5..897fce4d3 100644 --- a/.devops/nix/apps.nix +++ b/.devops/nix/apps.nix @@ -10,6 +10,7 @@ "llama-embedding" "llama-server" "llama-quantize" + "llama-train-text-from-scratch" ]; mkApp = name: { type = "app"; diff --git a/.devops/nix/devshells.nix b/.devops/nix/devshells.nix index bfd304af1..1862f0f08 100644 --- a/.devops/nix/devshells.nix +++ b/.devops/nix/devshells.nix @@ -1,52 +1,13 @@ -{ inputs, ... }: - { perSystem = - { - config, - lib, - system, - ... - }: + { config, lib, ... 
}: { devShells = - let - pkgs = import inputs.nixpkgs { inherit system; }; - stdenv = pkgs.stdenv; - scripts = config.packages.python-scripts; - in - lib.pipe (config.packages) [ - (lib.concatMapAttrs ( - name: package: { - ${name} = pkgs.mkShell { - name = "${name}"; - inputsFrom = [ package ]; - shellHook = '' - echo "Entering ${name} devShell" - ''; - }; - "${name}-extra" = - if (name == "python-scripts") then - null - else - pkgs.mkShell { - name = "${name}-extra"; - inputsFrom = [ - package - scripts - ]; - # Extra packages that *may* be used by some scripts - packages = [ - pkgs.python3Packages.tiktoken - ]; - shellHook = '' - echo "Entering ${name} devShell" - addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib" - ''; - }; - } - )) - (lib.filterAttrs (name: value: value != null)) - ]; + lib.concatMapAttrs + (name: package: { + ${name} = package.passthru.shell; + ${name + "-extra"} = package.passthru.shell-extra; + }) + config.packages; }; } diff --git a/.devops/nix/nixpkgs-instances.nix b/.devops/nix/nixpkgs-instances.nix index 90d683a71..4a2f81c4b 100644 --- a/.devops/nix/nixpkgs-instances.nix +++ b/.devops/nix/nixpkgs-instances.nix @@ -26,14 +26,16 @@ config.cudaSupport = true; config.allowUnfreePredicate = p: - builtins.all ( - license: - license.free - || builtins.elem license.shortName [ - "CUDA EULA" - "cuDNN EULA" - ] - ) (p.meta.licenses or [ p.meta.license ]); + builtins.all + ( + license: + license.free + || builtins.elem license.shortName [ + "CUDA EULA" + "cuDNN EULA" + ] + ) + (p.meta.licenses or [ p.meta.license ]); }; # Ensure dependencies use ROCm consistently pkgsRocm = import inputs.nixpkgs { diff --git a/.devops/nix/package-gguf-py.nix b/.devops/nix/package-gguf-py.nix deleted file mode 100644 index cca2f36a5..000000000 --- a/.devops/nix/package-gguf-py.nix +++ /dev/null @@ -1,36 +0,0 @@ -{ - lib, - llamaVersion, - numpy, - tqdm, - sentencepiece, - pyyaml, - poetry-core, - buildPythonPackage, - pytestCheckHook, -}: - -buildPythonPackage { - pname = "gguf"; - version = llamaVersion; - pyproject = true; - nativeBuildInputs = [ poetry-core ]; - propagatedBuildInputs = [ - numpy - tqdm - sentencepiece - pyyaml - ]; - src = lib.cleanSource ../../gguf-py; - pythonImportsCheck = [ - "numpy" - "gguf" - ]; - nativeCheckInputs = [ pytestCheckHook ]; - doCheck = true; - meta = with lib; { - description = "Python package for writing binary files in the GGUF format"; - license = licenses.mit; - maintainers = [ maintainers.ditsuke ]; - }; -} diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 043c4364b..911c42ecb 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -3,35 +3,32 @@ glibc, config, stdenv, + mkShell, runCommand, cmake, ninja, pkg-config, git, + python3, mpi, blas, cudaPackages, - autoAddDriverRunpath, darwin, rocmPackages, vulkan-headers, vulkan-loader, curl, shaderc, - useBlas ? - builtins.all (x: !x) [ - useCuda - useMetalKit - useRocm - useVulkan - ] - && blas.meta.available, + useBlas ? builtins.all (x: !x) [ + useCuda + useMetalKit + useRocm + useVulkan + ] && blas.meta.available, useCuda ? config.cudaSupport, useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin, - # Increases the runtime closure size by ~700M - useMpi ? false, + useMpi ? false, # Increases the runtime closure size by ~700M useRocm ? config.rocmSupport, - rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets, enableCurl ? true, useVulkan ? false, llamaVersion ? 
"0.0.0", # Arbitrary version, substituted by the flake @@ -40,8 +37,8 @@ # otherwise we get libstdc++ errors downstream. effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv, enableStatic ? effectiveStdenv.hostPlatform.isStatic, - precompileMetalShaders ? false, -}: + precompileMetalShaders ? false +}@inputs: let inherit (lib) @@ -49,6 +46,7 @@ let cmakeFeature optionals strings + versionOlder ; stdenv = throw "Use effectiveStdenv instead"; @@ -64,11 +62,54 @@ let pnameSuffix = strings.optionalString (suffices != [ ]) "-${strings.concatMapStringsSep "-" strings.toLower suffices}"; - descriptionSuffix = strings.optionalString ( - suffices != [ ] - ) ", accelerated with ${strings.concatStringsSep ", " suffices}"; + descriptionSuffix = + strings.optionalString (suffices != [ ]) + ", accelerated with ${strings.concatStringsSep ", " suffices}"; - xcrunHost = runCommand "xcrunHost" { } '' + executableSuffix = effectiveStdenv.hostPlatform.extensions.executable; + + # TODO: package the Python in this repository in a Nix-like way. + # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo + # is PEP 517-compatible, and ensure the correct .dist-info is generated. + # https://peps.python.org/pep-0517/ + # + # TODO: Package up each Python script or service appropriately, by making + # them into "entrypoints" + llama-python = python3.withPackages ( + ps: [ + ps.numpy + ps.sentencepiece + ] + ); + + # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime + llama-python-extra = python3.withPackages ( + ps: [ + ps.numpy + ps.sentencepiece + ps.tiktoken + ps.torchWithoutCuda + ps.transformers + + # server bench + ps.matplotlib + + # server tests + ps.openai + ps.behave + ps.prometheus-client + + # for examples/pydantic-models-to-grammar-examples.py + ps.docstring-parser + ps.pydantic + + # for scripts/compare-llama-bench.py + ps.gitpython + ps.tabulate + ] + ); + + xcrunHost = runCommand "xcrunHost" {} '' mkdir -p $out/bin ln -s /usr/bin/xcrun $out/bin ''; @@ -85,9 +126,16 @@ let ++ optionals useMetalKit [ MetalKit ]; cudaBuildInputs = with cudaPackages; [ - cuda_cudart - cuda_cccl # - libcublas + cuda_cccl.dev # + + # A temporary hack for reducing the closure size, remove once cudaPackages + # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792 + cuda_cudart.dev + cuda_cudart.lib + cuda_cudart.static + libcublas.dev + libcublas.lib + libcublas.static ]; rocmBuildInputs = with rocmPackages; [ @@ -103,145 +151,181 @@ let ]; in -effectiveStdenv.mkDerivation (finalAttrs: { - pname = "llama-cpp${pnameSuffix}"; - version = llamaVersion; +effectiveStdenv.mkDerivation ( + finalAttrs: { + pname = "llama-cpp${pnameSuffix}"; + version = llamaVersion; - # Note: none of the files discarded here are visible in the sandbox or - # affect the output hash. This also means they can be modified without - # triggering a rebuild. - src = lib.cleanSourceWith { - filter = - name: type: - let - noneOf = builtins.all (x: !x); - baseName = baseNameOf name; - in - noneOf [ - (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths - (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths - (lib.hasPrefix "." baseName) # Skip hidden files and directories - (baseName == "flake.lock") + # Note: none of the files discarded here are visible in the sandbox or + # affect the output hash. This also means they can be modified without + # triggering a rebuild. 
+ src = lib.cleanSourceWith { + filter = + name: type: + let + noneOf = builtins.all (x: !x); + baseName = baseNameOf name; + in + noneOf [ + (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths + (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths + (lib.hasPrefix "." baseName) # Skip hidden files and directories + (baseName == "flake.lock") + ]; + src = lib.cleanSource ../../.; + }; + + postPatch = '' + substituteInPlace ./ggml/src/ggml-metal.m \ + --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" + substituteInPlace ./ggml/src/ggml-metal.m \ + --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" + ''; + + # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015, + # `default.metallib` may be compiled with Metal compiler from XCode + # and we need to escape sandbox on MacOS to access Metal compiler. + # `xcrun` is used find the path of the Metal compiler, which is varible + # and not on $PATH + # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion + __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders; + + nativeBuildInputs = + [ + cmake + ninja + pkg-config + git + ] + ++ optionals useCuda [ + cudaPackages.cuda_nvcc + + # TODO: Replace with autoAddDriverRunpath + # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged + cudaPackages.autoAddOpenGLRunpathHook + ] + ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ + glibc.static + ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ + xcrunHost ]; - src = lib.cleanSource ../../.; - }; - postPatch = '' - substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \ - --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \ - --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" - ''; + buildInputs = + optionals effectiveStdenv.isDarwin darwinBuildInputs + ++ optionals useCuda cudaBuildInputs + ++ optionals useMpi [ mpi ] + ++ optionals useRocm rocmBuildInputs + ++ optionals useBlas [ blas ] + ++ optionals useVulkan vulkanBuildInputs + ++ optionals enableCurl [ curl ]; - # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015, - # `default.metallib` may be compiled with Metal compiler from XCode - # and we need to escape sandbox on MacOS to access Metal compiler. 
- # `xcrun` is used find the path of the Metal compiler, which is varible - # and not on $PATH - # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion - __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders; - - nativeBuildInputs = - [ - cmake - ninja - pkg-config - git - ] - ++ optionals useCuda [ - cudaPackages.cuda_nvcc - - autoAddDriverRunpath - ] - ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ] - ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ]; - - buildInputs = - optionals effectiveStdenv.isDarwin darwinBuildInputs - ++ optionals useCuda cudaBuildInputs - ++ optionals useMpi [ mpi ] - ++ optionals useRocm rocmBuildInputs - ++ optionals useBlas [ blas ] - ++ optionals useVulkan vulkanBuildInputs - ++ optionals enableCurl [ curl ]; - - cmakeFlags = - [ - (cmakeBool "LLAMA_BUILD_SERVER" true) - (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) - (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) - (cmakeBool "LLAMA_CURL" enableCurl) - (cmakeBool "GGML_NATIVE" false) - (cmakeBool "GGML_BLAS" useBlas) - (cmakeBool "GGML_CUDA" useCuda) - (cmakeBool "GGML_HIP" useRocm) - (cmakeBool "GGML_METAL" useMetalKit) - (cmakeBool "GGML_VULKAN" useVulkan) - (cmakeBool "GGML_STATIC" enableStatic) - ] - ++ optionals useCuda [ - ( - with cudaPackages.flags; - cmakeFeature "CMAKE_CUDA_ARCHITECTURES" ( - builtins.concatStringsSep ";" (map dropDot cudaCapabilities) + cmakeFlags = + [ + (cmakeBool "LLAMA_BUILD_SERVER" true) + (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) + (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) + (cmakeBool "LLAMA_CURL" enableCurl) + (cmakeBool "GGML_NATIVE" false) + (cmakeBool "GGML_BLAS" useBlas) + (cmakeBool "GGML_CUDA" useCuda) + (cmakeBool "GGML_HIPBLAS" useRocm) + (cmakeBool "GGML_METAL" useMetalKit) + (cmakeBool "GGML_VULKAN" useVulkan) + (cmakeBool "GGML_STATIC" enableStatic) + ] + ++ optionals useCuda [ + ( + with cudaPackages.flags; + cmakeFeature "CMAKE_CUDA_ARCHITECTURES" ( + builtins.concatStringsSep ";" (map dropDot cudaCapabilities) + ) ) - ) - ] - ++ optionals useRocm [ - (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang") - (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets) - ] - ++ optionals useMetalKit [ - (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") - (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) - ]; + ] + ++ optionals useRocm [ + (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang") + (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets)) + ] + ++ optionals useMetalKit [ + (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") + (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) + ]; - # Environment variables needed for ROCm - env = optionals useRocm { - ROCM_PATH = "${rocmPackages.clr}"; - HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode"; - }; + # Environment variables needed for ROCm + env = optionals useRocm { + ROCM_PATH = "${rocmPackages.clr}"; + HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode"; + }; - # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, - # if they haven't been added yet. - postInstall = '' - mkdir -p $out/include - cp $src/include/llama.h $out/include/ - ''; + # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, + # if they haven't been added yet. 
+ postInstall = '' + mkdir -p $out/include + cp $src/include/llama.h $out/include/ + ''; - meta = { - # Configurations we don't want even the CI to evaluate. Results in the - # "unsupported platform" messages. This is mostly a no-op, because - # cudaPackages would've refused to evaluate anyway. - badPlatforms = optionals useCuda lib.platforms.darwin; + # Define the shells here, but don't add in the inputsFrom to avoid recursion. + passthru = { + inherit + useBlas + useCuda + useMetalKit + useMpi + useRocm + useVulkan + ; - # Configurations that are known to result in build failures. Can be - # overridden by importing Nixpkgs with `allowBroken = true`. - broken = (useMetalKit && !effectiveStdenv.isDarwin); + shell = mkShell { + name = "shell-${finalAttrs.finalPackage.name}"; + description = "contains numpy and sentencepiece"; + buildInputs = [ llama-python ]; + inputsFrom = [ finalAttrs.finalPackage ]; + shellHook = '' + addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib" + ''; + }; - description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; - homepage = "https://github.com/ggerganov/llama.cpp/"; - license = lib.licenses.mit; + shell-extra = mkShell { + name = "shell-extra-${finalAttrs.finalPackage.name}"; + description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers"; + buildInputs = [ llama-python-extra ]; + inputsFrom = [ finalAttrs.finalPackage ]; + }; + }; - # Accommodates `nix run` and `lib.getExe` - mainProgram = "llama-cli"; + meta = { + # Configurations we don't want even the CI to evaluate. Results in the + # "unsupported platform" messages. This is mostly a no-op, because + # cudaPackages would've refused to evaluate anyway. + badPlatforms = optionals useCuda lib.platforms.darwin; - # These people might respond, on the best effort basis, if you ping them - # in case of Nix-specific regressions or for reviewing Nix-specific PRs. - # Consider adding yourself to this list if you want to ensure this flake - # stays maintained and you're willing to invest your time. Do not add - # other people without their consent. Consider removing people after - # they've been unreachable for long periods of time. + # Configurations that are known to result in build failures. Can be + # overridden by importing Nixpkgs with `allowBroken = true`. + broken = (useMetalKit && !effectiveStdenv.isDarwin); - # Note that lib.maintainers is defined in Nixpkgs, but you may just add - # an attrset following the same format as in - # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix - maintainers = with lib.maintainers; [ - philiptaron - SomeoneSerge - ]; + description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; + homepage = "https://github.com/ggerganov/llama.cpp/"; + license = lib.licenses.mit; - # Extend `badPlatforms` instead - platforms = lib.platforms.all; - }; -}) + # Accommodates `nix run` and `lib.getExe` + mainProgram = "llama-cli"; + + # These people might respond, on the best effort basis, if you ping them + # in case of Nix-specific regressions or for reviewing Nix-specific PRs. + # Consider adding yourself to this list if you want to ensure this flake + # stays maintained and you're willing to invest your time. Do not add + # other people without their consent. Consider removing people after + # they've been unreachable for long periods of time. 
+ + # Note that lib.maintainers is defined in Nixpkgs, but you may just add + # an attrset following the same format as in + # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix + maintainers = with lib.maintainers; [ + philiptaron + SomeoneSerge + ]; + + # Extend `badPlatforms` instead + platforms = lib.platforms.all; + }; + } +) diff --git a/.devops/nix/python-scripts.nix b/.devops/nix/python-scripts.nix deleted file mode 100644 index 56ea18278..000000000 --- a/.devops/nix/python-scripts.nix +++ /dev/null @@ -1,66 +0,0 @@ -{ - lib, - stdenv, - buildPythonPackage, - poetry-core, - mkShell, - python3Packages, - gguf-py, -}@inputs: - -let - llama-python-deps = with python3Packages; [ - numpy - sentencepiece - transformers - protobuf - torchWithoutCuda - gguf-py - tqdm - - # for scripts/compare-llama-bench.py - gitpython - tabulate - - # for examples/pydantic-models-to-grammar-examples.py - docstring-parser - pydantic - - ]; - - llama-python-test-deps = with python3Packages; [ - # Server bench - matplotlib - - # server tests - openai - pytest - prometheus-client - ]; -in - -buildPythonPackage ({ - pname = "llama-scripts"; - version = "0.0.0"; - pyproject = true; - - # NOTE: The files filtered out here are not visible in the build sandbox, neither - # do they affect the output hash. They can be modified without triggering a rebuild. - src = lib.cleanSourceWith { - filter = - name: type: - let - any = builtins.any (x: x); - baseName = builtins.baseNameOf name; - in - any [ - (lib.hasSuffix ".py" name) - (baseName == "README.md") - (baseName == "pyproject.toml") - ]; - src = lib.cleanSource ../../.; - }; - nativeBuildInputs = [ poetry-core ]; - nativeCheckInputs = llama-python-test-deps; - dependencies = llama-python-deps; -}) diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix index 478e8c422..78530c9e8 100644 --- a/.devops/nix/scope.nix +++ b/.devops/nix/scope.nix @@ -1,41 +1,19 @@ { lib, newScope, - python3, llamaVersion ? "0.0.0", }: -let - pythonPackages = python3.pkgs; - buildPythonPackage = pythonPackages.buildPythonPackage; - numpy = pythonPackages.numpy; - tqdm = pythonPackages.tqdm; - sentencepiece = pythonPackages.sentencepiece; - pyyaml = pythonPackages.pyyaml; - poetry-core = pythonPackages.poetry-core; - pytestCheckHook = pythonPackages.pytestCheckHook; -in - # We're using `makeScope` instead of just writing out an attrset # because it allows users to apply overlays later using `overrideScope'`. # Cf. 
https://noogle.dev/f/lib/makeScope -lib.makeScope newScope (self: { - inherit llamaVersion; - gguf-py = self.callPackage ./package-gguf-py.nix { - inherit - buildPythonPackage - numpy - tqdm - sentencepiece - poetry-core - pyyaml - pytestCheckHook - ; - }; - python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; }; - llama-cpp = self.callPackage ./package.nix { }; - docker = self.callPackage ./docker.nix { }; - docker-min = self.callPackage ./docker.nix { interactive = false; }; - sif = self.callPackage ./sif.nix { }; -}) +lib.makeScope newScope ( + self: { + inherit llamaVersion; + llama-cpp = self.callPackage ./package.nix { }; + docker = self.callPackage ./docker.nix { }; + docker-min = self.callPackage ./docker.nix { interactive = false; }; + sif = self.callPackage ./sif.nix { }; + } +) diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile deleted file mode 100644 index a8088ea00..000000000 --- a/.devops/rocm.Dockerfile +++ /dev/null @@ -1,113 +0,0 @@ -ARG UBUNTU_VERSION=24.04 - -# This needs to generally match the container host's environment. -ARG ROCM_VERSION=6.3 -ARG AMDGPU_VERSION=6.3 - -# Target the CUDA build image -ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete - -### Build image -FROM ${BASE_ROCM_DEV_CONTAINER} AS build - -# Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 -# This is mostly tied to rocBLAS supported archs. -# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported -# gfx906 is deprecated -#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html - -#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102' -ARG ROCM_DOCKER_ARCH=gfx1100 - -# Set nvcc architectured -ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} -# Enable ROCm -# ENV CC=/opt/rocm/llvm/bin/clang -# ENV CXX=/opt/rocm/llvm/bin/clang++ - -RUN apt-get update \ - && apt-get install -y \ - build-essential \ - cmake \ - git \ - libcurl4-openssl-dev \ - curl \ - libgomp1 - -WORKDIR /app - -COPY . . - -RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . 
-B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \ - && cmake --build build --config Release -j$(nproc) - -RUN mkdir -p /app/lib \ - && find build -name "*.so" -exec cp {} /app/lib \; - -RUN mkdir -p /app/full \ - && cp build/bin/* /app/full \ - && cp *.py /app/full \ - && cp -r gguf-py /app/full \ - && cp -r requirements /app/full \ - && cp requirements.txt /app/full \ - && cp .devops/tools.sh /app/full/tools.sh - -## Base image -FROM ${BASE_ROCM_DEV_CONTAINER} AS base - -RUN apt-get update \ - && apt-get install -y libgomp1 curl\ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - -COPY --from=build /app/lib/ /app - -### Full -FROM base AS full - -COPY --from=build /app/full /app - -WORKDIR /app - -RUN apt-get update \ - && apt-get install -y \ - git \ - python3-pip \ - python3 \ - python3-wheel\ - && pip install --break-system-packages --upgrade setuptools \ - && pip install --break-system-packages -r requirements.txt \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - -ENTRYPOINT ["/app/tools.sh"] - -### Light, CLI only -FROM base AS light - -COPY --from=build /app/full/llama-cli /app - -WORKDIR /app - -ENTRYPOINT [ "/app/llama-cli" ] - -### Server, Server only -FROM base AS server - -ENV LLAMA_ARG_HOST=0.0.0.0 - -COPY --from=build /app/full/llama-server /app - -WORKDIR /app - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/tools.sh b/.devops/tools.sh index 41a6b1e55..cf0e8f32d 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -8,40 +8,36 @@ arg1="$1" shift if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then - exec python3 ./convert_hf_to_gguf.py "$@" + python3 ./convert_hf_to_gguf.py "$@" elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then - exec ./llama-quantize "$@" + ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then - exec ./llama-cli "$@" -elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then - exec ./llama-bench "$@" -elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then - exec ./llama-perplexity "$@" + ./llama-cli "$@" +elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then + ./llama-finetune "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." - for i in $(ls $1/$2/ggml-model-f16.bin*); do + for i in `ls $1/$2/ggml-model-f16.bin*`; do if [ -f "${i/f16/q4_0}" ]; then echo "Skip model quantization, it already exists: ${i/f16/q4_0}" else echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." - exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 + ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 fi done elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then - exec ./llama-server "$@" + ./llama-server "$@" else echo "Unknown command: $arg1" echo "Available commands: " echo " --run (-r): Run a model previously converted into ggml" echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" - echo " --bench (-b): Benchmark the performance of the inference for various parameters." - echo " ex: -m model.gguf" - echo " --perplexity (-p): Measure the perplexity of a model over a given text." 
- echo " ex: -m model.gguf -f file.txt" echo " --convert (-c): Convert a llama model into ggml" echo " ex: --outtype f16 \"/models/7B/\" " echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" + echo " --finetune (-f): Run finetune command to create a lora finetune of the model" + echo " See documentation for finetune for command-line parameters" echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" echo " --server (-s): Run a model on the server" diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile deleted file mode 100644 index 9064f3838..000000000 --- a/.devops/vulkan.Dockerfile +++ /dev/null @@ -1,89 +0,0 @@ -ARG UBUNTU_VERSION=24.04 - -FROM ubuntu:$UBUNTU_VERSION AS build - -# Install build tools -RUN apt update && apt install -y git build-essential cmake wget - -# Install Vulkan SDK and cURL -RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ - wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \ - apt update -y && \ - apt-get install -y vulkan-sdk libcurl4-openssl-dev curl - -# Build it -WORKDIR /app - -COPY . . - -RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ - cmake --build build --config Release -j$(nproc) - -RUN mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -RUN mkdir -p /app/full \ - && cp build/bin/* /app/full \ - && cp *.py /app/full \ - && cp -r gguf-py /app/full \ - && cp -r requirements /app/full \ - && cp requirements.txt /app/full \ - && cp .devops/tools.sh /app/full/tools.sh - -## Base image -FROM ubuntu:$UBUNTU_VERSION AS base - -RUN apt-get update \ - && apt-get install -y libgomp1 curl libvulkan-dev \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - -COPY --from=build /app/lib/ /app - -### Full -FROM base AS full - -COPY --from=build /app/full /app - -WORKDIR /app - -RUN apt-get update \ - && apt-get install -y \ - git \ - python3 \ - python3-pip \ - python3-wheel \ - && pip install --break-system-packages --upgrade setuptools \ - && pip install --break-system-packages -r requirements.txt \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - -ENTRYPOINT ["/app/tools.sh"] - -### Light, CLI only -FROM base AS light - -COPY --from=build /app/full/llama-cli /app - -WORKDIR /app - -ENTRYPOINT [ "/app/llama-cli" ] - -### Server, Server only -FROM base AS server - -ENV LLAMA_ARG_HOST=0.0.0.0 - -COPY --from=build /app/full/llama-server /app - -WORKDIR /app - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/app/llama-server" ] diff --git a/.dockerignore b/.dockerignore index 064b7c7be..8916e2a66 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,7 +1,7 @@ *.o *.a .cache/ -# Do not ignore .git directory, otherwise the reported build number will always be 0 +.git/ .github/ .gitignore .vs/ diff --git a/.ecrc b/.ecrc index c68877ec2..a3351f4e6 100644 --- a/.ecrc +++ b/.ecrc @@ -1,5 +1,5 @@ { - "Exclude": ["^\\.gitmodules$", "stb_image\\.h"], + "Exclude": ["^\\.gitmodules$"], "Disable": { "IndentSize": true } diff --git a/.editorconfig 
b/.editorconfig index 5d63d0a51..f88f8da67 100644 --- a/.editorconfig +++ b/.editorconfig @@ -24,27 +24,9 @@ insert_final_newline = unset [examples/server/public/*] indent_size = 2 -[examples/server/public/deps_*] -trim_trailing_whitespace = unset -indent_style = unset -indent_size = unset - -[examples/server/deps_*] -trim_trailing_whitespace = unset -indent_style = unset -indent_size = unset - [examples/llama.swiftui/llama.swiftui.xcodeproj/*] indent_style = tab [examples/cvector-generator/*.txt] trim_trailing_whitespace = unset insert_final_newline = unset - -[models/templates/*.jinja] -indent_style = unset -indent_size = unset -end_of_line = unset -charset = unset -trim_trailing_whitespace = unset -insert_final_newline = unset diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml new file mode 100644 index 000000000..54785854f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml @@ -0,0 +1,50 @@ +name: Low Severity Bugs +description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches) +title: "Bug: " +labels: ["bug-unconfirmed", "low severity"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + Please include information about your system, the steps to reproduce the bug, + and the version of llama.cpp that you are using. + If possible, please provide a minimal code example that reproduces the bug. + - type: textarea + id: what-happened + attributes: + label: What happened? + description: Also tell us, what did you expect to happen? + placeholder: Tell us what you see! + validations: + required: true + - type: textarea + id: version + attributes: + label: Name and Version + description: Which executable and which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./llama-cli --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: What operating system are you seeing the problem on? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + render: shell diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml deleted file mode 100644 index b85bf5741..000000000 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ /dev/null @@ -1,87 +0,0 @@ -name: Bug (compilation) -description: Something goes wrong when trying to compile llama.cpp. -title: "Compile bug: " -labels: ["bug-unconfirmed", "compilation"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for bug reports where the compilation of llama.cpp fails. - Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`. - If the compilation succeeds with ccache disabled you should be able to permanently fix the issue - by clearing `~/.cache/ccache` (on Linux). - - type: textarea - id: commit - attributes: - label: Git commit - description: Which commit are you trying to compile? 
- placeholder: | - $git rev-parse HEAD - 84a07a17b1b08cf2b9747c633a2372782848a27f - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: true - - type: dropdown - id: backends - attributes: - label: GGML backends - description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] - multiple: true - validations: - required: true - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it. - If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us. - placeholder: > - I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY. - Here are the exact commands that I used: ... - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: command - attributes: - label: Compile command - description: > - Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - Please copy and paste any relevant log output, including any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml deleted file mode 100644 index 1ccef0793..000000000 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ /dev/null @@ -1,101 +0,0 @@ -name: Bug (model use) -description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module). -title: "Eval bug: " -labels: ["bug-unconfirmed", "model evaluation"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for bug reports where the model evaluation results - (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation. - If you encountered the issue while using an external UI (e.g. ollama), - please reproduce your issue using one of the examples/binaries in this repository. - The `llama-cli` binary can be used for simple and reproducible model inference. - - type: textarea - id: version - attributes: - label: Name and Version - description: Which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? 
- multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: true - - type: dropdown - id: backends - attributes: - label: GGML backends - description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] - multiple: true - validations: - required: true - - type: textarea - id: hardware - attributes: - label: Hardware - description: Which CPUs/GPUs are you using? - placeholder: > - e.g. Ryzen 5950X + 2x RTX 4090 - validations: - required: true - - type: textarea - id: model - attributes: - label: Models - description: > - Which model(s) at which quantization were you using when encountering the bug? - If you downloaded a GGUF file off of Huggingface, please provide a link. - placeholder: > - e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M - validations: - required: false - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it. - If you can narrow down the bug to specific hardware, compile flags, or command line arguments, - that information would be very much appreciated by us. - placeholder: > - e.g. when I run llama-cli with -ngl 99 I get garbled outputs. - When I use -ngl 0 it works correctly. - Here are the exact commands that I used: ... - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - Please copy and paste any relevant log output, including the command that you entered and any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml deleted file mode 100644 index 1904e31fd..000000000 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: Bug (misc.) -description: Something is not working the way it should (and it's not covered by any of the above cases). -title: "Misc. bug: " -labels: ["bug-unconfirmed"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for miscellaneous bugs that don't fit into any other category. - If you encountered the issue while using an external UI (e.g. ollama), - please reproduce your issue using one of the examples/binaries in this repository. - - type: textarea - id: version - attributes: - label: Name and Version - description: Which version of our software is affected? (You can use `--version` to get a version string.) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? 
(Please let us know in description) - validations: - required: false - - type: dropdown - id: module - attributes: - label: Which llama.cpp modules do you know to be affected? - multiple: true - options: - - Documentation/Github - - libllama (core library) - - llama-cli - - llama-server - - llama-bench - - llama-quantize - - Python/Bash scripts - - Test code - - Other (Please specify in the next section) - validations: - required: false - - type: textarea - id: command - attributes: - label: Command line - description: > - Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: false - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it (if applicable). - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - If applicable, please copy and paste any relevant log output, including any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml new file mode 100644 index 000000000..a6285c6f0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml @@ -0,0 +1,50 @@ +name: Medium Severity Bug +description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable) +title: "Bug: " +labels: ["bug-unconfirmed", "medium severity"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + Please include information about your system, the steps to reproduce the bug, + and the version of llama.cpp that you are using. + If possible, please provide a minimal code example that reproduces the bug. + - type: textarea + id: what-happened + attributes: + label: What happened? + description: Also tell us, what did you expect to happen? + placeholder: Tell us what you see! + validations: + required: true + - type: textarea + id: version + attributes: + label: Name and Version + description: Which executable and which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./llama-cli --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: What operating system are you seeing the problem on? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 
+ render: shell diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml new file mode 100644 index 000000000..ff816b937 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml @@ -0,0 +1,50 @@ +name: High Severity Bug +description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow) +title: "Bug: " +labels: ["bug-unconfirmed", "high severity"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + Please include information about your system, the steps to reproduce the bug, + and the version of llama.cpp that you are using. + If possible, please provide a minimal code example that reproduces the bug. + - type: textarea + id: what-happened + attributes: + label: What happened? + description: Also tell us, what did you expect to happen? + placeholder: Tell us what you see! + validations: + required: true + - type: textarea + id: version + attributes: + label: Name and Version + description: Which executable and which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./llama-cli --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: What operating system are you seeing the problem on? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + render: shell diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml new file mode 100644 index 000000000..7af42a80b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml @@ -0,0 +1,50 @@ +name: Critical Severity Bug +description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss) +title: "Bug: " +labels: ["bug-unconfirmed", "critical severity"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + Please include information about your system, the steps to reproduce the bug, + and the version of llama.cpp that you are using. + If possible, please provide a minimal code example that reproduces the bug. + - type: textarea + id: what-happened + attributes: + label: What happened? + description: Also tell us, what did you expect to happen? + placeholder: Tell us what you see! + validations: + required: true + - type: textarea + id: version + attributes: + label: Name and Version + description: Which executable and which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./llama-cli --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: What operating system are you seeing the problem on? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? 
(Please let us know in description) + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + render: shell diff --git a/.github/ISSUE_TEMPLATE/020-enhancement.yml b/.github/ISSUE_TEMPLATE/05-enhancement.yml similarity index 97% rename from .github/ISSUE_TEMPLATE/020-enhancement.yml rename to .github/ISSUE_TEMPLATE/05-enhancement.yml index 02dd4f575..58fca7318 100644 --- a/.github/ISSUE_TEMPLATE/020-enhancement.yml +++ b/.github/ISSUE_TEMPLATE/05-enhancement.yml @@ -1,5 +1,5 @@ name: Enhancement -description: Used to request enhancements for llama.cpp. +description: Used to request enhancements for llama.cpp title: "Feature Request: " labels: ["enhancement"] body: diff --git a/.github/ISSUE_TEMPLATE/030-research.yml b/.github/ISSUE_TEMPLATE/06-research.yml similarity index 97% rename from .github/ISSUE_TEMPLATE/030-research.yml rename to .github/ISSUE_TEMPLATE/06-research.yml index 18975dbbf..3ae4e9f8c 100644 --- a/.github/ISSUE_TEMPLATE/030-research.yml +++ b/.github/ISSUE_TEMPLATE/06-research.yml @@ -1,5 +1,5 @@ name: Research -description: Track new technical research area. +description: Track new technical research area title: "Research: " labels: ["research 🔬"] body: diff --git a/.github/ISSUE_TEMPLATE/040-refactor.yml b/.github/ISSUE_TEMPLATE/07-refactor.yml similarity index 95% rename from .github/ISSUE_TEMPLATE/040-refactor.yml rename to .github/ISSUE_TEMPLATE/07-refactor.yml index b6e6ab36d..3a68d3d53 100644 --- a/.github/ISSUE_TEMPLATE/040-refactor.yml +++ b/.github/ISSUE_TEMPLATE/07-refactor.yml @@ -1,5 +1,5 @@ name: Refactor (Maintainers) -description: Used to track refactoring opportunities. 
+description: Used to track refactoring opportunities title: "Refactor: " labels: ["refactor"] body: diff --git a/.github/labeler.yml b/.github/labeler.yml index 1b47bc968..89436740d 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -3,18 +3,19 @@ Kompute: - changed-files: - any-glob-to-any-file: - ggml/include/ggml-kompute.h - - ggml/src/ggml-kompute/** + - ggml/src/ggml-kompute.cpp - README-kompute.md Apple Metal: - changed-files: - any-glob-to-any-file: - ggml/include/ggml-metal.h - - ggml/src/ggml-metal/** + - ggml/src/ggml-metal.cpp - README-metal.md SYCL: - changed-files: - any-glob-to-any-file: - ggml/include/ggml-sycl.h + - ggml/src/ggml-sycl.cpp - ggml/src/ggml-sycl/** - docs/backend/SYCL.md - examples/sycl/** @@ -26,8 +27,8 @@ Nvidia GPU: Vulkan: - changed-files: - any-glob-to-any-file: - - ggml/include/ggml-vulkan.h - - ggml/src/ggml-vulkan/** + - ggml/ggml_vk_generate_shaders.py + - ggml/src/ggml-vulkan* documentation: - changed-files: - any-glob-to-any-file: @@ -74,7 +75,11 @@ server: ggml: - changed-files: - any-glob-to-any-file: - - ggml/** + - ggml/include/ggml*.h + - ggml/src/ggml*.c + - ggml/src/ggml*.cpp + - ggml/src/ggml*.h + - ggml-cuda/** nix: - changed-files: - any-glob-to-any-file: diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index d9f5bdc23..997c6d9d0 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1 +1,7 @@ -*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR* + + +- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) +- Self-reported review complexity: + - [ ] Low + - [ ] Medium + - [ ] High diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml similarity index 95% rename from .github/workflows/bench.yml.disabled rename to .github/workflows/bench.yml index 1c8787ef7..eb69b82c4 100644 --- a/.github/workflows/bench.yml.disabled +++ b/.github/workflows/bench.yml @@ -1,6 +1,3 @@ -# TODO: there have been some issues with the workflow, so disabling for now -# https://github.com/ggerganov/llama.cpp/issues/7893 -# # Benchmark name: Benchmark @@ -27,10 +24,10 @@ on: push: branches: - master - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] pull_request_target: types: [opened, synchronize, reopened] - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] schedule: - cron: '04 2 * * *' @@ -132,8 +129,6 @@ jobs: - name: Server bench id: server_bench - env: - HEAD_REF: ${{ github.head_ref || github.ref_name }} run: | set -eux @@ -142,7 +137,7 @@ jobs: python bench.py \ --runner-label ${{ env.RUNNER_LABEL }} \ --name ${{ github.job }} \ - --branch $HEAD_REF \ + --branch ${{ github.head_ref || github.ref_name }} \ --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ --scenario script.js \ --duration ${{ github.event.inputs.duration || env.DURATION }} \ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6841ba589..a1e183d11 100644 --- 
a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,27 +10,19 @@ on: push: branches: - master - paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] pull_request: types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} cancel-in-progress: true -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - contents: write # for creating release - env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 jobs: macOS-latest-cmake-arm64: @@ -43,12 +35,6 @@ jobs: with: fetch-depth: 0 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-arm64 - evict-old-files: 1d - - name: Dependencies id: depends continue-on-error: true @@ -59,14 +45,10 @@ jobs: id: cmake_build run: | sysctl -a - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=ON \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + mkdir build + cd build + cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF .. + cmake --build . 
--config Release -j $(sysctl -n hw.logicalcpu) - name: Test id: cmake_test @@ -92,7 +74,6 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - name: Upload artifacts @@ -103,7 +84,7 @@ jobs: name: llama-bin-macos-arm64.zip macOS-latest-cmake-x64: - runs-on: macos-13 + runs-on: macos-12 steps: - name: Clone @@ -112,12 +93,6 @@ jobs: with: fetch-depth: 0 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-x64 - evict-old-files: 1d - - name: Dependencies id: depends continue-on-error: true @@ -130,12 +105,7 @@ jobs: sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=ON \ - -DGGML_METAL=OFF \ - -DGGML_RPC=ON + cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -162,7 +132,6 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - name: Upload artifacts @@ -172,8 +141,68 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip name: llama-bin-macos-x64.zip - ubuntu-cpu-cmake: - runs-on: ubuntu-22.04 + ubuntu-focal-make: + runs-on: ubuntu-20.04 + env: + LLAMA_NODE_AVAILABLE: true + LLAMA_PYTHON_AVAILABLE: true + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential gcc-8 + + - uses: actions/setup-node@v4 + with: + node-version: "20" + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Build + id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 + run: | + CC=gcc-8 make -j $(nproc) + + - name: Test + id: make_test + run: | + CC=gcc-8 make tests -j $(nproc) + make test -j $(nproc) + + ubuntu-focal-make-curl: + runs-on: ubuntu-20.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev + + - name: Build + id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 + LLAMA_CURL: 1 + run: | + CC=gcc-8 make -j $(nproc) + + ubuntu-latest-cmake: + runs-on: ubuntu-latest steps: - name: Clone @@ -182,12 +211,6 @@ jobs: with: fetch-depth: 0 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-cpu-cmake - evict-old-files: 1d - - name: Dependencies id: depends run: | @@ -197,11 +220,10 @@ jobs: - name: Build id: cmake_build run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=ON \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(nproc) + mkdir build + cd build + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF + cmake --build . 
--config Release -j $(nproc) - name: Test id: cmake_test @@ -238,7 +260,6 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ - cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* - name: Upload artifacts @@ -256,19 +277,13 @@ jobs: strategy: matrix: sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug] + build_type: [Debug, Release] steps: - name: Clone id: checkout uses: actions/checkout@v4 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }} - evict-old-files: 1d - - name: Dependencies id: depends run: | @@ -279,52 +294,19 @@ jobs: id: cmake_build if: ${{ matrix.sanitizer != 'THREAD' }} run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) + mkdir build + cd build + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - name: Build (no OpenMP) id: cmake_build_no_openmp if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DGGML_OPENMP=OFF - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-llguidance: - runs-on: ubuntu-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build run: | mkdir build cd build - cmake .. \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_LLGUIDANCE=ON - cmake --build . --config Release -j $(nproc) + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF + cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - name: Test id: cmake_test @@ -342,12 +324,6 @@ jobs: id: checkout uses: actions/checkout@v4 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-rpc - evict-old-files: 1d - - name: Dependencies id: depends run: | @@ -357,9 +333,10 @@ jobs: - name: Build id: cmake_build run: | - cmake -B build \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(nproc) + mkdir build + cd build + cmake -DGGML_RPC=ON .. + cmake --build . 
--config Release -j $(nproc) - name: Test id: cmake_test @@ -375,33 +352,21 @@ jobs: id: checkout uses: actions/checkout@v4 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-vulkan - evict-old-files: 1d - - name: Dependencies id: depends run: | wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk + sudo apt-get install -y build-essential vulkan-sdk - name: Build id: cmake_build run: | - cmake -B build \ - -DGGML_VULKAN=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | + mkdir build cd build - # This is using llvmpipe and runs slower than other backends - ctest -L main --verbose --timeout 1800 + cmake -DGGML_VULKAN=ON .. + cmake --build . --config Release -j $(nproc) ubuntu-22-cmake-hip: runs-on: ubuntu-22.04 @@ -410,7 +375,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v4 + uses: actions/checkout@v3 - name: Dependencies id: depends @@ -418,64 +383,25 @@ jobs: sudo apt-get update sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-hip - evict-old-files: 1d - - name: Build with native CMake HIP support id: cmake_build run: | - cmake -B build -S . \ - -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ - -DGGML_HIP=ON + cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON cmake --build build --config Release -j $(nproc) - name: Build with legacy HIP support id: cmake_build_legacy_hip run: | - cmake -B build2 -S . \ - -DCMAKE_C_COMPILER=hipcc \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DGGML_HIP=ON + cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON cmake --build build2 --config Release -j $(nproc) - ubuntu-22-cmake-musa: - runs-on: ubuntu-22.04 - container: mthreads/musa:rc3.1.0-devel-ubuntu22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - apt-get update - apt-get install -y build-essential git cmake libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-musa - evict-old-files: 1d - - - name: Build with native CMake MUSA support - id: cmake_build - run: | - cmake -B build -S . \ - -DGGML_MUSA=ON - cmake --build build --config Release -j $(nproc) - ubuntu-22-cmake-sycl: runs-on: ubuntu-22.04 continue-on-error: true steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: add oneAPI to apt shell: bash @@ -501,21 +427,14 @@ jobs: id: checkout uses: actions/checkout@v4 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-sycl - evict-old-files: 1d - - name: Build id: cmake_build run: | source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx - cmake --build build --config Release -j $(nproc) + mkdir build + cd build + cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. + cmake --build . 
--config Release -j $(nproc) ubuntu-22-cmake-sycl-fp16: runs-on: ubuntu-22.04 @@ -523,7 +442,7 @@ jobs: continue-on-error: true steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: add oneAPI to apt shell: bash @@ -549,22 +468,77 @@ jobs: id: checkout uses: actions/checkout@v4 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-sycl-fp16 - evict-old-files: 1d - - name: Build id: cmake_build run: | source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx \ - -DGGML_SYCL_F16=ON - cmake --build build --config Release -j $(nproc) + mkdir build + cd build + cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON .. + cmake --build . --config Release -j $(nproc) + + # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know + # how to debug it. + # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 + macOS-latest-make: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build + id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 + run: | + GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) + + - name: Test + id: make_test + run: | + GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu) + GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu) + + # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know + # how to debug it. + # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 + # would be great if we fix these + macOS-latest-cmake: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build + id: cmake_build + run: | + sysctl -a + mkdir build + cd build + cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF .. + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 macOS-latest-cmake-ios: runs-on: macos-latest @@ -572,13 +546,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-ios - evict-old-files: 1d + uses: actions/checkout@v1 - name: Dependencies id: depends @@ -590,8 +558,9 @@ jobs: id: cmake_build run: | sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ + mkdir build + cd build + cmake -G Xcode .. \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -599,7 +568,7 @@ jobs: -DCMAKE_SYSTEM_NAME=iOS \ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + cmake --build . 
--config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO macOS-latest-cmake-tvos: runs-on: macos-latest @@ -607,13 +576,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-tvos - evict-old-files: 1d + uses: actions/checkout@v1 - name: Dependencies id: depends @@ -625,8 +588,9 @@ jobs: id: cmake_build run: | sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ + mkdir build + cd build + cmake -G Xcode .. \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -634,7 +598,7 @@ jobs: -DCMAKE_SYSTEM_NAME=tvOS \ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO macOS-latest-swift: runs-on: macos-latest @@ -646,13 +610,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-swift - evict-old-files: 1d + uses: actions/checkout@v1 - name: Dependencies id: depends @@ -660,24 +618,15 @@ jobs: run: | brew update - - name: Build llama.cpp with CMake - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - sudo cmake --install build --config Release - - name: xcodebuild for swift package id: xcodebuild run: | - xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}" + xcodebuild -scheme llama -destination "${{ matrix.destination }}" + + - name: Build Swift Example + id: make_build_swift_example + run: | + make swift windows-msys2: runs-on: windows-latest @@ -693,13 +642,6 @@ jobs: - name: Clone uses: actions/checkout@v4 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-msys2 - variant: sccache - evict-old-files: 1d - - name: Setup ${{ matrix.sys }} uses: msys2/setup-msys2@v2 with: @@ -707,11 +649,25 @@ jobs: msystem: ${{matrix.sys}} install: >- base-devel - git mingw-w64-${{matrix.env}}-toolchain mingw-w64-${{matrix.env}}-cmake mingw-w64-${{matrix.env}}-openblas + - name: Build using make + shell: msys2 {0} + run: | + make -j $(nproc) + + - name: Clean after building using make + shell: msys2 {0} + run: | + make clean + + - name: Build using make w/ OpenBLAS + shell: msys2 {0} + run: | + make GGML_OPENBLAS=1 -j $(nproc) + - name: Build using CMake shell: msys2 {0} run: | @@ -730,7 +686,7 @@ jobs: cmake --build build --config ${{ matrix.build }} -j $(nproc) windows-latest-cmake: - runs-on: windows-latest + runs-on: windows-2019 env: OPENBLAS_VERSION: 0.3.23 @@ -740,26 +696,26 @@ jobs: strategy: matrix: include: + - build: 'rpc-x64' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' - build: 'noavx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx2-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON' + defines: 
'-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'avx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx512-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' - build: 'openblas-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'kompute-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - - build: 'llvm-arm64-opencl-adreno' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' steps: - name: Clone @@ -768,18 +724,11 @@ jobs: with: fetch-depth: 0 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-${{ matrix.build }} - variant: sccache - evict-old-files: 1d - - name: Clone Kompute submodule id: clone_kompute if: ${{ matrix.build == 'kompute-x64' }} run: | - git submodule update --init ggml/src/ggml-kompute/kompute + git submodule update --init ggml/src/kompute - name: Download OpenBLAS id: get_openblas @@ -808,26 +757,6 @@ jobs: run: | choco install ninja - - name: Install OpenCL Headers and Libs - id: install_opencl - if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }} - run: | - git clone https://github.com/KhronosGroup/OpenCL-Headers - cd OpenCL-Headers - cmake -B build ` - -DBUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build --target install - git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader - cd OpenCL-ICD-Loader - cmake -B build-arm64-release ` - -A arm64 ` - -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" 
- cmake --build build-arm64-release --target install --config release - - name: Build id: cmake_build run: | @@ -857,7 +786,7 @@ jobs: - name: Test id: cmake_test # not all machines have native AVX-512 - if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} + if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} run: | cd build ctest -L main -C Release --verbose --timeout 900 @@ -893,7 +822,6 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt - Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* - name: Upload artifacts @@ -903,47 +831,12 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip name: llama-bin-win-${{ matrix.build }}.zip - ubuntu-latest-cmake-cuda: - runs-on: ubuntu-latest - container: nvidia/cuda:12.6.2-devel-ubuntu24.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install dependencies - env: - DEBIAN_FRONTEND: noninteractive - run: | - apt update - apt install -y cmake build-essential ninja-build libgomp1 git - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-cuda - evict-old-files: 1d - - - name: Build with CMake - run: | - cmake -S . 
-B build -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CUDA_ARCHITECTURES=89-real \ - -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_NATIVE=OFF \ - -DGGML_CUDA=ON - cmake --build build - - windows-2019-cmake-cuda: + windows-latest-cmake-cuda: runs-on: windows-2019 strategy: matrix: - cuda: ['12.4', '11.7'] + cuda: ['12.2.0', '11.7.1'] build: ['cuda'] steps: @@ -951,89 +844,23 @@ jobs: id: checkout uses: actions/checkout@v4 with: - fetch-depth: 0 + fetch-depth: 0 - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 + - name: Install CUDA toolkit + id: cuda-toolkit + uses: Jimver/cuda-toolkit@v0.2.15 with: - key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }} - variant: sccache - evict-old-files: 1d - - - name: Install Cuda Toolkit 11.7 - if: ${{ matrix.cuda == '11.7' }} - run: | - mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" - choco install unzip -y - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip" - unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - - - name: Install Cuda Toolkit 12.4 - if: ${{ matrix.cuda == '12.4' }} - run: | - mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" - choco install unzip -y - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" - unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I 
/H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja + cuda: ${{ matrix.cuda }} + method: 'network' + sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' - name: Build id: cmake_build - shell: cmd run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" - cmake -S . -B build -G "Ninja Multi-Config" ^ - -DLLAMA_BUILD_SERVER=ON ^ - -DGGML_NATIVE=OFF ^ - -DGGML_CUDA=ON ^ - -DGGML_RPC=ON - set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% -t ggml - cmake --build build --config Release + mkdir build + cd build + cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON + cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) - name: Determine tag name id: tag @@ -1062,12 +889,10 @@ jobs: name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip - name: Copy and pack Cuda runtime - if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} run: | - echo "Cuda install location: ${{ env.CUDA_PATH }}" + echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" $dst='.\build\bin\cudart\' - robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll + robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* - name: Upload Cuda runtime @@ -1085,8 +910,8 @@ jobs: shell: bash env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel + WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" steps: - name: Clone @@ -1095,16 +920,8 @@ jobs: with: fetch-depth: 0 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-sycl - variant: sccache - evict-old-files: 1d - - name: Install - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - name: 
Build id: cmake_build @@ -1123,33 +940,24 @@ jobs: echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT fi - - name: Build the release package + - name: Pack artifacts id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" - - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin - echo "cp oneAPI running time dll files to ./build/bin done" 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* - - name: Upload the release package + - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: @@ -1157,75 +965,19 @@ jobs: name: llama-bin-win-sycl-x64.zip windows-latest-cmake-hip: - if: ${{ github.event.inputs.create_release != 'true' }} runs-on: windows-latest steps: - name: Clone id: checkout - uses: actions/checkout@v4 + uses: actions/checkout@v3 - name: Install id: depends run: | $ErrorActionPreference = "Stop" write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ${{ github.job }} - evict-old-files: 1d - - - name: Build - id: cmake_build - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . 
` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_BUILD_TYPE=Release ` - -DGGML_HIP=ON ` - -DGGML_RPC=ON - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - - windows-latest-cmake-hip-release: - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - runs-on: windows-latest - - strategy: - matrix: - gpu_target: [gfx1100, gfx1101, gfx1030] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-hip-release - evict-old-files: 1d - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait write-host "Completed AMD HIP SDK installation" @@ -1240,42 +992,8 @@ jobs: run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . ` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_BUILD_TYPE=Release ` - -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` - -DGGML_HIP=ON ` - -DGGML_RPC=ON - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - md "build\bin\rocblas\library\" - cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - run: | - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + cmake -G "Unix Makefiles" -B build -S . 
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON + cmake --build build --config Release ios-xcode-build: runs-on: macos-latest @@ -1284,27 +1002,6 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - sudo cmake --install build --config Release - - - name: xcodebuild for swift package - id: xcodebuild - run: | - xcodebuild -scheme llama-Package -destination 'generic/platform=iOS' - - name: Build Xcode project run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build @@ -1315,12 +1012,6 @@ jobs: - name: Clone uses: actions/checkout@v4 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: android-build - evict-old-files: 1d - - name: Set up JDK uses: actions/setup-java@v3 with: @@ -1338,16 +1029,35 @@ jobs: ./gradlew build --no-daemon +# freeBSD-latest: +# runs-on: macos-12 +# steps: +# - name: Clone +# uses: actions/checkout@v4 +# +# - name: Build +# uses: cross-platform-actions/action@v0.19.0 +# with: +# operating_system: freebsd +# version: '13.2' +# hypervisor: 'qemu' +# run: | +# sudo pkg update +# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas +# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu` + release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} runs-on: ubuntu-latest needs: - - ubuntu-cpu-cmake + - ubuntu-focal-make + - ubuntu-latest-cmake + - macOS-latest-make + - macOS-latest-cmake - windows-latest-cmake - - windows-2019-cmake-cuda - - windows-latest-cmake-hip-release + - windows-latest-cmake-cuda - macOS-latest-cmake-arm64 - macOS-latest-cmake-x64 @@ -1358,12 +1068,6 @@ jobs: with: fetch-depth: 0 - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: release - evict-old-files: 1d - - name: Determine tag name id: tag shell: bash @@ -1389,7 +1093,7 @@ jobs: - name: Create release id: create_release - uses: ggml-org/action-create-release@v1 + uses: anzz1/action-create-release@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: @@ -1609,37 +1313,3 @@ jobs: # popd # emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} # make - - openEuler-latest-cmake-cann: - if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }} - defaults: - run: - shell: bash -el {0} - runs-on: ubuntu-24.04-arm - strategy: - matrix: - cann: - - '8.0.rc3.beta1-910b-openeuler22.03-py3.10' - device: - - 'ascend910b3' - build: - - 'Release' - container: ascendai/cann:${{ matrix.cann }} - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Dependencies - run: | - yum update -y - yum install -y git gcc gcc-c++ make cmake - - - name: Build - run: | - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} - - cmake -S . 
-B build \ - -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ - -DGGML_CANN=on \ - -DSOC_TYPE=${{ matrix.device }} - cmake --build build -j $(nproc) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index 276a217d4..69c9f4f69 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -3,11 +3,6 @@ on: schedule: - cron: "42 0 * * *" -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - issues: write - jobs: close-issues: runs-on: ubuntu-latest @@ -17,7 +12,7 @@ jobs: steps: - uses: actions/stale@v5 with: - exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap" + exempt-issue-labels: "refactor,help wanted,good first issue,research,bug" days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 6955a7dc8..bf94b2024 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -10,50 +10,48 @@ name: Publish Docker image on: - workflow_dispatch: # allows manual triggering - schedule: - # Rebuild daily rather than on every push because it is expensive - - cron: '12 4 * * *' + #pull_request: + push: + branches: + - master + paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} cancel-in-progress: true -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - packages: write - jobs: push_to_registry: name: Push Docker image to Docker Hub + #if: github.event.pull_request.draft == false - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest env: COMMIT_SHA: ${{ github.sha }} strategy: - fail-fast: false matrix: config: - # Multi-stage build - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false} - - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true } + - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "light-cuda", 
dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete. + #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } steps: - name: Check out the repo uses: actions/checkout@v4 - with: - fetch-depth: 0 # preserve git history, so we can determine the build number - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + uses: docker/setup-qemu-action@v2 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v2 - name: Log in to Docker Hub uses: docker/login-action@v2 @@ -62,45 +60,9 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case - REPO_NAME="${{ github.event.repository.name }}" - - # determine tag name postfix (build number, commit hash) - if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then - TAG_POSTFIX="-b${BUILD_NUMBER}" - else - SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-') - TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}" - fi - # list all tags possible - if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then - TYPE="" - else - TYPE="-${{ matrix.config.tag }}" - fi - PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" - FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}" - LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}" - SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}" - echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT - echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT - echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT - echo "full_output_tags=$FULLTAGS" # print out for debugging - echo "light_output_tags=$LIGHTTAGS" # print out for debugging - echo "server_output_tags=$SERVERTAGS" # print out for debugging - env: - GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - + # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example - name: Free Disk Space (Ubuntu) - if: ${{ matrix.config.free_disk_space == true }} - uses: ggml-org/free-disk-space@v1.3.1 + uses: jlumbroso/free-disk-space@main with: # this might remove tools that are actually needed, # if set to "true" but frees about 6 GB @@ -115,59 +77,40 @@ jobs: docker-images: true swap-storage: true - - name: Build and push Full Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} - 
uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.full_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: full - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi - - name: Build and push Light Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }} - uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.light_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: light - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache + - name: Downcase github.repository_owner + run: | + echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV + env: + GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - - name: Build and push Server Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }} - uses: docker/build-push-action@v6 + - name: Build and push Docker image (versioned) + if: github.event_name == 'push' + uses: docker/build-push-action@v4 with: context: . push: true platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.server_output_tags }} + tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" + file: ${{ matrix.config.dockerfile }} + + - name: Build and push Docker image (tagged) + uses: docker/build-push-action@v4 + with: + context: . 
+ push: ${{ github.event_name == 'push' }} + platforms: ${{ matrix.config.platforms }} + tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}" file: ${{ matrix.config.dockerfile }} - target: server - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index f02b7c219..ae86e9927 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -23,7 +23,5 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: editorconfig-checker/action-editorconfig-checker@v2 - with: - version: v3.0.3 + - uses: editorconfig-checker/action-editorconfig-checker@main - run: editorconfig-checker diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml new file mode 100644 index 000000000..4aa4b2379 --- /dev/null +++ b/.github/workflows/nix-ci-aarch64.yml @@ -0,0 +1,65 @@ +name: Nix aarch64 builds + +on: + workflow_dispatch: # allows manual triggering + schedule: + # Rebuild daily rather than on every push because QEMU is expensive (e.g. + # 1.5h instead of minutes with the cold cache). + # + # randint(0, 59), randint(0, 23) + - cron: '26 12 * * *' + # But also rebuild if we touched any of the Nix expressions: + push: + branches: + - master + paths: ['**/*.nix', 'flake.lock'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['**/*.nix', 'flake.lock'] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +jobs: + nix-build-aarch64: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install QEMU + # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654 + run: | + sudo apt-get update + sudo apt-get install -y qemu-user-static qemu-system-aarch64 + sudo usermod -a -G kvm $USER + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@v9 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + extra-conf: | + extra-platforms = aarch64-linux + extra-system-features = nixos-test kvm + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + - uses: DeterminateSystems/magic-nix-cache-action@v2 + with: + upstream-cache: https://${{ matrix.cachixName }}.cachix.org + - name: Set-up cachix to push the results to + uses: cachix/cachix-action@v13 + with: + authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + name: llama-cpp + - name: Show all output paths + run: > + nix run github:nix-community/nix-eval-jobs + -- --gc-roots-dir gcroot + --flake + ".#packages.aarch64-linux" + - name: Build + run: > + nix run github:Mic92/nix-fast-build + -- --skip-cached --no-nom + --systems aarch64-linux + --flake + ".#checks.aarch64-linux" diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml new file mode 100644 index 000000000..8955f38d0 --- /dev/null +++ b/.github/workflows/nix-ci.yml @@ -0,0 +1,72 @@ +name: Nix 
CI + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + pull_request: + types: [opened, synchronize, reopened] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +jobs: + nix-eval: + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@v9 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + extra-conf: | + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + - uses: DeterminateSystems/magic-nix-cache-action@v2 + with: + upstream-cache: https://${{ matrix.cachixName }}.cachix.org + - name: List all flake outputs + run: nix flake show --all-systems + - name: Show all output paths + run: > + nix run github:nix-community/nix-eval-jobs + -- --gc-roots-dir gcroot + --flake + ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)" + nix-build: + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@v9 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + extra-conf: | + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + - uses: DeterminateSystems/magic-nix-cache-action@v2 + with: + upstream-cache: https://${{ matrix.cachixName }}.cachix.org + - name: Set-up cachix to push the results to + uses: cachix/cachix-action@v13 + with: + authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + name: llama-cpp + - name: Build + run: > + nix run github:Mic92/nix-fast-build + -- --skip-cached --no-nom + --flake + ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)" diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml new file mode 100644 index 000000000..3a6a96e26 --- /dev/null +++ b/.github/workflows/nix-flake-update.yml @@ -0,0 +1,22 @@ +name: update-flake-lock +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00 + +jobs: + lockfile: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@main + - name: Update flake.lock + uses: DeterminateSystems/update-flake-lock@main + with: + pr-title: "nix: update flake.lock" + pr-labels: | + nix + pr-reviewers: philiptaron,SomeoneSerge + token: ${{ secrets.FLAKE_TOKEN }} diff --git a/.github/workflows/nix-publish-flake.yml b/.github/workflows/nix-publish-flake.yml new file mode 100644 index 000000000..2c3c1ebda --- /dev/null +++ b/.github/workflows/nix-publish-flake.yml @@ -0,0 +1,36 @@ +# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes +name: "Publish a flake to flakestry & flakehub" +on: + push: + tags: + - "*" + workflow_dispatch: + inputs: + tag: + description: "The existing tag to publish" + type: "string" + required: true +jobs: 
+ flakestry-publish: + runs-on: ubuntu-latest + permissions: + id-token: "write" + contents: "read" + steps: + - uses: flakestry/flakestry-publish@main + with: + version: "${{ inputs.tag || github.ref_name }}" + flakehub-publish: + runs-on: "ubuntu-latest" + permissions: + id-token: "write" + contents: "read" + steps: + - uses: "actions/checkout@v4" + with: + ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}" + - uses: "DeterminateSystems/nix-installer-action@main" + - uses: "DeterminateSystems/flakehub-push@main" + with: + visibility: "public" + tag: "${{ inputs.tag }}" diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 46e80aecd..4e0374fc6 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -6,13 +6,15 @@ on: - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - - '**/requirements*.txt' + - 'requirements.txt' + - 'requirements/*.txt' pull_request: paths: - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - - '**/requirements*.txt' + - 'requirements.txt' + - 'requirements/*.txt' concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index ddfdf73b8..a8d46f31d 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -1,13 +1,6 @@ name: flake8 Lint -on: - push: - branches: - - master - paths: ['.github/workflows/python-lint.yml', '**/*.py'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/python-lint.yml', '**/*.py'] +on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml index 373bb6010..e5ff5e6d7 100644 --- a/.github/workflows/python-type-check.yml +++ b/.github/workflows/python-type-check.yml @@ -4,13 +4,11 @@ on: push: paths: - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' - '**.py' - '**/requirements*.txt' pull_request: paths: - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' - '**.py' - '**/requirements*.txt' @@ -35,6 +33,6 @@ jobs: - name: Type-check with Pyright uses: jakebailey/pyright-action@v2 with: - version: 1.1.382 + version: 1.1.370 level: warning warnings: true diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 3a29107d0..99feb28f2 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -20,12 +20,6 @@ on: types: [opened, synchronize, reopened] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] -env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 - concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true @@ -76,49 +70,20 @@ jobs: run: | pip install -r examples/server/tests/requirements.txt - # Setup nodejs (to be used for verifying bundled index.html) - - uses: actions/setup-node@v4 - with: - node-version: '22.11.0' - - - name: WebUI - Install dependencies - id: webui_lint - run: | - cd examples/server/webui - npm ci - - - name: WebUI - 
Check code format - id: webui_format + - name: Verify server deps + id: verify_server_deps run: | git config --global --add safe.directory $(realpath .) - cd examples/server/webui + cd examples/server + git ls-files --others --modified git status - - npm run format + ./deps.sh git status - modified_files="$(git status -s)" - echo "Modified files: ${modified_files}" - if [ -n "${modified_files}" ]; then - echo "Files do not follow coding style. To fix: npm run format" - echo "${modified_files}" - exit 1 - fi - - - name: Verify bundled index.html - id: verify_server_index_html - run: | - git config --global --add safe.directory $(realpath .) - cd examples/server/webui - git status - - npm run build - git status - modified_files="$(git status -s)" - echo "Modified files: ${modified_files}" - if [ -n "${modified_files}" ]; then - echo "Repository is dirty or server/webui is not built as expected" - echo "Hint: You may need to follow Web UI build guide in server/README.md" - echo "${modified_files}" + not_ignored_files="$(git ls-files --others --modified)" + echo "Modified files: ${not_ignored_files}" + if [ -n "${not_ignored_files}" ]; then + echo "Repository is dirty or server deps are not built as expected" + echo "${not_ignored_files}" exit 1 fi @@ -135,9 +100,9 @@ jobs: -DGGML_OPENMP=OFF ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - name: Build (sanitizers) - id: cmake_build_sanitizers - if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }} + - name: Build + id: cmake_build + if: ${{ matrix.sanitizer != 'THREAD' }} run: | cmake -B build \ -DGGML_NATIVE=OFF \ @@ -147,37 +112,18 @@ jobs: -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - name: Build (sanitizers) - id: cmake_build - if: ${{ matrix.sanitizer == '' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - name: Tests id: server_integration_tests - if: ${{ matrix.sanitizer == '' }} run: | cd examples/server/tests - ./tests.sh - - - name: Tests (sanitizers) - id: server_integration_tests_sanitizers - if: ${{ matrix.sanitizer != '' }} - run: | - cd examples/server/tests - LLAMA_SANITIZE=1 ./tests.sh + PORT=8888 ./tests.sh - name: Slow tests id: server_integration_tests_slow if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} run: | cd examples/server/tests - SLOW_TESTS=1 ./tests.sh + PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow server-windows: @@ -227,13 +173,11 @@ jobs: if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests - $env:PYTHONIOENCODING = ":replace" - pytest -v -x -m "not slow" + behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - name: Slow tests id: server_integration_tests_slow if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} run: | cd examples/server/tests - $env:SLOW_TESTS = "1" - pytest -v -x + behave.exe --stop --no-skipped --no-capture --tags slow diff --git a/.gitignore b/.gitignore index 694f36e04..7c7dee0c6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ *.a *.bat *.bin -*.d *.dll *.dot *.etag @@ -18,7 +17,6 @@ *.metallib *.o *.so -*.swp *.tmp 
# IDE / OS @@ -52,7 +50,6 @@ build* !docs/build.md /libllama.so /llama-* -/vulkan-shaders-gen android-ndk-* arm_neon.h cmake-build-* @@ -63,7 +60,6 @@ llama-batched-swift /rpc-server out/ tmp/ -autogen-*.md # Deprecated @@ -82,6 +78,7 @@ models-mnt !models/ggml-vocab-*.gguf* # Zig + zig-out/ zig-cache/ @@ -105,10 +102,6 @@ examples/server/*.mjs.hpp !examples/sycl/*.bat !examples/sycl/*.sh -# Server Web UI temporary files -node_modules -examples/server/webui/dist - # Python /.venv @@ -136,10 +129,3 @@ poetry.toml # Scripts !/scripts/install-oneapi.bat - -# Test models for lora adapters -/lora-tests - -# Local scripts -/run-vim.sh -/run-chat.sh diff --git a/.gitmodules b/.gitmodules index 23ce5ff05..5861d59cb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "kompute"] - path = ggml/src/ggml-kompute/kompute + path = ggml/src/kompute url = https://github.com/nomic-ai/kompute.git diff --git a/AUTHORS b/AUTHORS index 6796b2941..1bd36158a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,4 @@ -# date: Tue Feb 4 13:04:05 EET 2025 +# date: Wed Jun 26 19:36:34 EEST 2024 # this file is auto-generated by scripts/gen-authors.sh 0cc4m @@ -7,7 +7,6 @@ 2f38b454 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> 44670 <44670@users.noreply.github.com> -65a <10104049+65a@users.noreply.github.com> AN Long AT Aarni Koskela @@ -20,30 +19,20 @@ Adithya Balaji AdithyanI Adrian Adrian Hesketh -Adrien Gallouët -Adrien Gallouët -Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com> Ahmet Zeer AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> -AidanBeltonS Aisuko -Akarshan Biswas Akarshan Biswas -Al Mochkin <14274697+amochkin@users.noreply.github.com> Albert Jin Alberto <57916483+albbus-stack@users.noreply.github.com> -Alberto Cabrera Pérez -Alberto Cabrera Pérez Alex Alex Azarov Alex Azarov Alex Klinkhamer Alex Klinkhamer Alex Nguyen -Alex O'Connell <35843486+acon96@users.noreply.github.com> Alex Petenchea Alex Renda -Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com> Alex von Gluck IV Alexey Parfenov Ali Chraghi <63465728+alichraghi@users.noreply.github.com> @@ -56,26 +45,18 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com> Ananta Bastola Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> András Salamon -Andreas (Andi) Kunar -Andreas Kieslinger <47689530+aendk@users.noreply.github.com> Andrei Andrew Canis Andrew Downing Andrew Duffy Andrew Godfrey -Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com> -Andy Salerno Andy Tai -Anthony Van de Gejuchte -Antonis Makropoulos Arik Poznanski -Armen Kaleshian Artem Artem Zinnatullin Artyom Lebedev Asbjørn Olling Ásgeir Bjarni Ingvarsson -Asghar Ghorbani Ashish <1856117+ashishdatta@users.noreply.github.com> Ashok Gelal <401055+ashokgelal@users.noreply.github.com> Ashraful Islam @@ -94,21 +75,13 @@ Ben Siraphob Ben Williams Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com> Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> -Benson Wong Bernat Vadell -Bernhard M. 
Wiedemann -Bert Wagner -Billel Mokeddem Bingan <70050083+binganao@users.noreply.github.com> -Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com> Bodo Graumann Bono Lv Borislav Stanimirov -Borislav Stanimirov Branden Butler -Brandon Squizzato <35474886+bsquizz@users.noreply.github.com> Brian -Brian Cunnie Bruce MacDonald Bryan Honof CJ Pais @@ -117,51 +90,32 @@ Calvin Laurenson Cameron Cameron Kaiser Carolinabanana <140120812+Carolinabanana@users.noreply.github.com> -CarryFun <76023481+CarryFun@users.noreply.github.com> -Carsten Kragelund Jørgensen -CarterLi999 <664681047@qq.com> Casey Primozic Casey Primozic CausalLM <148736309+CausalLM@users.noreply.github.com> Cebtenzzre -CentricStorm Chad Brewbaker -Changyeon Kim Chao Jiang -Charles Xu <63788048+chaxu01@users.noreply.github.com> -Charles Xu -Chen Xi -Chen Xi Cheng Shao -Chenguang Li <87689256+noemotiovon@users.noreply.github.com> Chris Elrod Chris Kuehl Christian Demsar Christian Demsar Christian Falch <875252+chrfalch@users.noreply.github.com> -Christian Kastner Christian Kögler -Christian Köhnenkamp Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> -Christopher Nielsen <62156882+mascguy@users.noreply.github.com> Clark Saben <76020733+csaben@users.noreply.github.com> Clint Herron -Conrad Kramer -Corentin REGAL CrispStrobe <154636388+CrispStrobe@users.noreply.github.com> -Csaba Kecskemeti Cuong Trinh Manh DAN™ Damian Stewart -Dan Johansson <164997844+eddnjjn@users.noreply.github.com> -Dan Johansson Dane Madsen DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com> Daniel Bevenius Daniel Drake Daniel Hiltgen Daniel Illescas Romero -Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Daniele <57776841+daniandtheweb@users.noreply.github.com> DannyDaemonic Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> @@ -175,29 +129,19 @@ David Pflug David Renshaw David Sommers <12738+databyte@users.noreply.github.com> David Yang -DavidKorczynski Dawid Potocki Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com> Dean Deins -Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com> -Derrick T. Woolworth Deven Mistry <31466137+deven367@users.noreply.github.com> -Dibakar Gope Didzis Gosko -Diego Devesa -Diogo Teles Sant'Anna -Djip007 <3705339+Djip007@users.noreply.github.com> Djip007 Don Mahurin DooWoong Lee (David) Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com> -Dou Xinpeng <15529241576@163.com> -Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com> Douglas Hanley Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com> Ebey Abraham -Echo Nolan Ed Lee Ed Lepedus Eddie-Wang @@ -205,16 +149,12 @@ Edward Taylor Elaine Elbios <141279586+Elbios@users.noreply.github.com> Elton Kola -Emreerdog <34742675+Emreerdog@users.noreply.github.com> Engininja2 <139037756+Engininja2@users.noreply.github.com> Equim -Eric Curtin -Eric Curtin Eric Sommerlade Eric Zhang <34133756+EZForever@users.noreply.github.com> Erik Garrison Erik Scholz -Esko Toivonen Ettore Di Giacinto Evan Jones Evan Miller @@ -226,27 +166,19 @@ FK Fabian Fabio R. 
Sluzala Faez Shakil -Faisal Zaghloul -Faisal Zaghloul -Fan Shupei FantasyGmm <16450052+FantasyGmm@users.noreply.github.com> -Farbod Bijary <110523279+farbodbj@users.noreply.github.com> Fattire <528174+fat-tire@users.noreply.github.com> Felix Finn Voorhees Firat -FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com> Folko-Ven <71110216+Folko-Ven@users.noreply.github.com> Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com> Francisco Melo <43780565+francis2tm@users.noreply.github.com> Frank Mai FrankHB -Frankie Robertson Fred Douglas <43351173+fredlas@users.noreply.github.com> Frederik Vogel Gabe Goodhart -Gabe Goodhart -Gaetan Bisson GainLee Galunid Gary Linscott @@ -255,15 +187,12 @@ Gavin Zhao Genkagaku.GPT Georgi Gerganov Gilad S -Gilad S. <7817232+giladgd@users.noreply.github.com> Giuseppe Scrivano GiviMAD Govlzkoy Guillaume "Vermeille" Sanchez Guillaume Wenzek -Guoliang Hua <32868157+nbcsm@users.noreply.github.com> Guoteng <32697156+SolenoidWGT@users.noreply.github.com> -Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com> Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> Haggai Nuchi Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com> @@ -274,47 +203,35 @@ Haoxiang Fei Harald Fernengel Hatsune Miku <129688334+at8u@users.noreply.github.com> HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com> -Haus1 Henk Poley Henri Vasserman Henrik Forstén Herman Semenov Hesen Peng -HimariO Hoang Nguyen Hong Bo PENG Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> Howard Su Hua Jiang -Huang Qi Huawei Lin Hugo Roussel -Huifeng Ou <79071290+ho2103@users.noreply.github.com> Ian Bull Ian Bull Ian Scrivener -Icecream95 Ido S IgnacioFDM Igor Okulist -Ihar Hrachyshka Ikko Eltociear Ashimine Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> Ionoclast Laboratories Isaac McFadyen IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com> -Ivan -Ivan Filipov <159561759+vanaka11@users.noreply.github.com> Ivan Komarov Ivan Stepanov -JFLFY2255 JH23X <165871467+JH23X@users.noreply.github.com> -Jack Mousseau Jack Mousseau JackJollimore <130917767+JackJollimore@users.noreply.github.com> -Jaeden Amero Jaemin Son -Jafar Uruç Jag Chadha Jakub N James A Capozzoli <157492257+jac-jim@users.noreply.github.com> @@ -326,16 +243,11 @@ Jannis Schönleber Jared Van Bortel Jared Van Bortel Jason McCartney -Jason Stillerman Jean-Christophe Hoelt Jean-Michaël Celerier Jed Fox -Jeff Bolz -Jeffrey Morgan Jeffrey Quesnelle -Jeroen Mostert Jesse Jojo Johnson -Jett Janiak Jeximo Jhen-Jie Hong Jiahao Li @@ -346,9 +258,6 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com> Jiří Sejkora Joan Fontanals Joan Fontanals -João Dinis Ferreira -Joe Eli McIlvain -Joe Todd Johan Johannes Gäßler Johannes Rudolph @@ -364,11 +273,8 @@ Josh Ramer Joyce Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Judd -Juk Armstrong <69222624+jukofyork@users.noreply.github.com> Julius Arkenberg -Jun Hee Yoo Jun Jie <71215065+junnjiee16@users.noreply.github.com> -Junil Kim Junyang Lin Juraj Bednar Justin Parker @@ -379,7 +285,6 @@ Justine Tunney Juuso Alasuutari KASR Kamil Tomšík -Karol Kontny <82021046+kkontny@users.noreply.github.com> Karsten Weiss Karthick Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com> @@ -387,19 +292,16 @@ Karthik Sethuraman Kasumi <90275229+kasumi-1@users.noreply.github.com> Kawrakow <48489457+ikawrakow@users.noreply.github.com> Keiichi Tabata -Keke Han Kenvix ⭐ Kerfuffle 
<44031344+KerfuffleV2@users.noreply.github.com> Kevin Gibbons Kevin Ji <1146876+kevinji@users.noreply.github.com> Kevin Kwok Kevin Lo -Kevin Wang Kolen Cheung Konstantin Herud Konstantin Zhuravlyov Kunshang Ji -Kyle Bruene Kyle Liang Kyle Mistele Kylin <56434533+KyL0N@users.noreply.github.com> @@ -413,30 +315,22 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com> Leonardo Neumann Li Tan Linwei Wang -Liu Jia <109258120+Septa2112@users.noreply.github.com> -Liu Jia LoganDark -Loïc Carrère LostRuins <39025047+LostRuins@users.noreply.github.com> -LostRuins Concedo <39025047+LostRuins@users.noreply.github.com> Luciano Luo Tian Lyle Dean -M-A M. Yusuf Sarıgöz -Ma Mingfei Maarten ter Huurne Mack Straight Maël Kerbiriou MaggotHATE -Mahesh Madhav <67384846+heshpdx@users.noreply.github.com> Manuel <44313466+makuche@users.noreply.github.com> Marc Köhlbrugge Marco Matthies <71844+marcom@users.noreply.github.com> Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Marian Cepok Mark Fairbairn -Mark Zhuang Marko Tasic Markus Tavenrath Martin Delille @@ -448,16 +342,11 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com> Mateusz Charytoniuk Matheus C. França Matheus Gabriel Alves Silva -Mathieu Baudier -Mathieu Geli Mathieu Nayrolles -Mathijs Henquet Mathijs de Bruin Matt Clayton <156335168+mattjcly@users.noreply.github.com> Matt Pulver -Matt Stephenson Matteo Boschini <12133566+mbosc@users.noreply.github.com> -Matteo Mortari Mattheus Chediak Matthew Tejo Matvey Soloviev @@ -467,11 +356,8 @@ Maxime <672982+maximegmd@users.noreply.github.com> Maximilian Winter Meng Zhang Meng, Hengyu -Mengqing Cao Merrick Christensen Michael Coppola -Michael Engel -Michael Francis Michael Hueschen Michael Kesper Michael Klimenko @@ -479,81 +365,52 @@ Michael Podvitskiy Michael Potter Michael de Gans Michaël de Vries -Michał Moskal -Michał Tuszyński -Michelle Tan <41475767+MichelleTanPY@users.noreply.github.com> Mihai Mike Mikko Juola Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> -Minsoo Cheong Mirko185 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com> -MistApproach <98988043+MistApproach@users.noreply.github.com> Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> Mohammadreza Hendiani Mohammadreza Hendiani -Molly Sophia -MorganRO8 <47795945+MorganRO8@users.noreply.github.com> Murilo Santana Musab Gultekin Nam D. 
Tran <42194884+namtranase@users.noreply.github.com> Nathan Epstein -Natsu NawafAlansari <72708095+NawafAlansari@users.noreply.github.com> Nebula Neo Zhang <14088817+arthw@users.noreply.github.com> Neo Zhang Neo Zhang Jianyu Neuman Vong -NeverLucky <92274250+nvrxq@users.noreply.github.com> -Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Nexesenex <124105151+Nexesenex@users.noreply.github.com> Niall Coates <1349685+Niall-@users.noreply.github.com> -Nicholai Tukanov -Nico Bosshard Nicolai Weitkemper Nicolás Pérez -Nicolò Scipione Nigel Bosch -Nikita Sarychev <42014488+sARY77@users.noreply.github.com> Niklas Korz -NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com> -Nikolaos Pothitos Nikolas <127742645+nneubacher@users.noreply.github.com> Nindaleth -Nuno -OSecret <135510162+OLSecret@users.noreply.github.com> Oleksandr Nikitin Oleksii Maryshchenko Olivier Chafik Ondřej Čertík Ouadie EL FAROUKI -PAB -Pablo Duboue -Pascal Patry Patrice Ferlet Paul Tsochantaris -Pavel Zloi Pavol Rusnak -Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com> Pedro Cuenca -Peter Peter Sugihara Phil H <5756783+phiharri@users.noreply.github.com> Philip Taron Phillip Kravtsov Pierre Alexandre SCHEMBRI Pierrick Hymbert -Pieter Ouwerkerk -Plamen Minev -Prashant Vithule <119530321+Vithulep@users.noreply.github.com> Przemysław Pawełczyk Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> Qingyou Meng Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com> -R0CKSTAR -R0CKSTAR RJ Adriaansen Radoslav Gerganov Radosław Gryta @@ -562,16 +419,11 @@ Raj Hammeer Singh Hada Ralph Soika Rand Xie Randall Fitzgerald -Random Fly Reinforce-II -Rémy Oudompheng Ren Xuancheng Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> -Reza Kakhki RhinoDevel -Riccardo Orlando Riceball LEE -Rich Dougherty Richard Kiss Richard Roberson Rick G <26732651+TheFlipbook@users.noreply.github.com> @@ -582,39 +434,26 @@ Riley Stewart Rinne Rinne Robert Brisita <986796+rbrisita@users.noreply.github.com> -Robert Collins -Robert Ormandi <52251610+ormandi@users.noreply.github.com> Robert Sung-wook Shin Robey Holderith Robyn Roger Meier Roland <14355895+rbur0425@users.noreply.github.com> -Romain Biessy Romain D <90720+Artefact2@users.noreply.github.com> Romain Neutron Roman Parykin Ron Evans Ron Jailall -Roni Ronny Brendel Ronsor Rowan Hart -Ruan <47767371+ruanych@users.noreply.github.com> -Ruchira Hasaranga -Rudi Servo -Ruixin Huang <18860020911@163.com> Rune <43761327+Rune-AI@users.noreply.github.com> -RunningLeon -RunningLeon Ryan Landay Ryder Wishart Ryuei Rőczey Barnabás <31726601+An0nie@users.noreply.github.com> -SRHMorris <69468379+SRHMorris@users.noreply.github.com> -SXX SakuraUmi Salvador E. 
Tropea -Salvatore Mesoraca Sam Spilsbury Sami Farin <3876865+Safari77@users.noreply.github.com> Samuel Maynard @@ -624,29 +463,23 @@ Sebastián A SebastianApel <13675545+SebastianApel@users.noreply.github.com> Senemu <10880819+Senemu@users.noreply.github.com> Sergey Alirzaev -Sergio López Sergio López Sertaç Özercan <852750+sozercan@users.noreply.github.com> SeungWon Jeong <65549245+redlion0929@users.noreply.github.com> ShadovvBeast Shakhar Dasgupta -Shane A Shangning Xu <32517059+xushangning@users.noreply.github.com> -Shankar -Shanshan Shen <467638484@qq.com> Shijie <821898965@qq.com> Shintarou Okada Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> Shouzheng Liu Shuichi Tsutsumi -Shupei Fan Sigbjørn Skjæret Simon Willison Siwen Yu Sky Yan Slaren <2141330+slaren@users.noreply.github.com> Slava Primenko -Small Grass Forest SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com> Someone Someone Serge @@ -658,33 +491,25 @@ Stefan Sydow Steffen Röcker Stephan Walter Stephen Nichols -Steve Bonds Steve Grubb Steven Prichard Steven Roussey Steward Garcia <57494570+FSSRepo@users.noreply.github.com> -StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com> Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> -Sukriti Sharma SuperUserNameMan -Sutou Kouhei Tai Duc Nguyen Taikono-Himazin Tameem <113388789+AhmadTameem@users.noreply.github.com> Tamotsu Takahashi -Tei Home Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com> Thatcher Chamberlin Theia Vogel Thérence <13496987+Royalphax@users.noreply.github.com> Thibault Terrasson Thomas Klausner -Thorsten Sommer Tim Miller -Tim Wang Timmy Knight Timothy Cronin <40186632+4imothy@users.noreply.github.com> -Ting Lou Ting Lou Ting Sun Tobias Lütke @@ -692,44 +517,32 @@ Tom C Tom Jobbins <784313+TheBloke@users.noreply.github.com> Tomas Tomáš Pazdiora -Tony Wasserka <4840017+neobrain@users.noreply.github.com> Tristan Druyen Tristan Ross -Trivikram Kamat <16024985+trivikr@users.noreply.github.com> Tungsten842 <886724vf@anonaddy.me> Tungsten842 Tushar UEXTM.com <84163508+uextm@users.noreply.github.com> -Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com> Ulrich Drepper Uzo Nweke Vaibhav Srivastav Val Kharitonov Valentin Konovalov -Valentin Mamedov <45292985+Inf1delis@users.noreply.github.com> Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> -Vali Malinoiu <0x4139@gmail.com> Victor Nogueira Victor Z. 
Peng -Viet-Anh NGUYEN (Andrew) -Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com> Vlad Vladimir Vladimir Malyutin Vladimir Zorin -VoidIsVoid <343750470@qq.com> Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com> -Wang Qin <37098874+wangqin0@users.noreply.github.com> -Wang Ran (汪然) WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com> Weird Constructor Welby Seely Wentai Zhang WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com> William Tambellini -William Tambellini Willy Tarreau -Woof Dog <197125663+woof-dog@users.noreply.github.com> Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com> Wu Jian Ping Wu Jian Ping @@ -738,25 +551,15 @@ Xiang (Kevin) Li Xiao-Yong Jin XiaotaoChen Xiaoyi Chen -Xie Yanbo Xingchen Song(宋星辰) -Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com> Xuan Son Nguyen -Xuan-Son Nguyen -Yaiko Yann Follet <131855179+YannFollet@users.noreply.github.com> Yaroslav Yazan Agha-Schrader Yiming Cui Yishuo Wang -Yoshi Suhara -Yoshi Suhara -Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> -Yüg Yui -Yun Dou -Yuri Khrustalev Yusuf Kağan Hanoğlu Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com> ZHAOKAI WANG @@ -765,27 +568,19 @@ Zay <95888118+isaiahbjork@users.noreply.github.com> Zenix Zhang Peiyuan Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com> -Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> -Zhiyuan Li -Zhiyuan Li ZhouYuChen Ziad Ben Hadj-Alouane Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> Zsapi a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com> -a3sh <38979186+A3shTnT@users.noreply.github.com> adel boussaken afrideva <95653597+afrideva@users.noreply.github.com> -ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com> agray3 akawrykow <142945436+akawrykow@users.noreply.github.com> -alek3y <44779186+alek3y@users.noreply.github.com> alexpinel <93524949+alexpinel@users.noreply.github.com> alonfaraj alwqx -amd-dwang amd-lalithnc -amritahs-ibm andrijdavid anon998 <131767832+anon998@users.noreply.github.com> anzz1 @@ -793,31 +588,24 @@ apaz apcameron <37645737+apcameron@users.noreply.github.com> arch-btw <57669023+arch-btw@users.noreply.github.com> arcrank -ardfork <134447697+ardfork@users.noreply.github.com> arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> -aryantandon01 <80969509+aryantandon01@users.noreply.github.com> at8u <129688334+at8u@users.noreply.github.com> automaticcat -awatuna <23447591+awatuna@users.noreply.github.com> -b4b4o bandoti <141645996+bandoti@users.noreply.github.com> beiller bhubbb <79117352+bhubbb@users.noreply.github.com> bmwl bobqianic <129547291+bobqianic@users.noreply.github.com> -brucepro bryanSwk <93190252+bryanSwk@users.noreply.github.com> bsilvereagle bssrdf byte-6174 <88070277+byte-6174@users.noreply.github.com> -cduk <19917266+cduk@users.noreply.github.com> cebtenzzre chaihahaha chiranko <96988916+chiranko@users.noreply.github.com> clibdev <52199778+clibdev@users.noreply.github.com> clyang cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com> -codezjx coezbek comex compilade <113953597+compilade@users.noreply.github.com> @@ -826,14 +614,10 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com> crasm crasm daboe01 -daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com> -daminho <37615795+daminho@users.noreply.github.com> david raistrick ddh0 ddpasa <112642920+ddpasa@users.noreply.github.com> deepdiffuser 
<112834445+deepdiffuser@users.noreply.github.com> -devojony <61173062+devojony@users.noreply.github.com> -ditsuke divinity76 dm4 dotpy314 <33351922+dotpy314@users.noreply.github.com> @@ -841,25 +625,18 @@ drbh ds5t5 <145942675+ds5t5@users.noreply.github.com> dylan eastriver -ebraminio ebraminio eiery <19350831+eiery@users.noreply.github.com> eric8607242 fairydreaming <166155368+fairydreaming@users.noreply.github.com> -fengerhu1 <2748250768@qq.com> -fj-y-saito <85871716+fj-y-saito@users.noreply.github.com> fraxy-v <65565042+fraxy-v@users.noreply.github.com> github-actions[bot] gliptic -gn64 goerch grahameth <96447521+grahameth@users.noreply.github.com> -gtygo gwjr <502526+gwjr@users.noreply.github.com> h-h-h-h <13482553+h-h-h-h@users.noreply.github.com> hankcs -haopeng <657407891@qq.com> -hipudding hoangmit hongbo.mo <352280764@qq.com> hopkins385 <98618192+hopkins385@users.noreply.github.com> @@ -872,16 +649,12 @@ hxer7963 hydai iSma iacore <74560659+iacore@users.noreply.github.com> -icppWorld <124377669+icppWorld@users.noreply.github.com> igarnier intelmatt <61025942+intelmatt@users.noreply.github.com> iohub -issixx <46835150+issixx@users.noreply.github.com> jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> jameswu2014 <545426914@qq.com> -jdomke <28772296+jdomke@users.noreply.github.com> -jiahao su jiez <373447296@qq.com> jneem joecryptotoo <80373433+joecryptotoo@users.noreply.github.com> @@ -894,7 +667,6 @@ junchao-loongson <68935141+junchao-loongson@users.noreply.github.com> jwj7140 <32943891+jwj7140@users.noreply.github.com> k.h.lai kaizau -kallewoof kalomaze <66376113+kalomaze@users.noreply.github.com> kang katsu560 <118887472+katsu560@users.noreply.github.com> @@ -902,46 +674,32 @@ kchro3 <62481661+kchro3@users.noreply.github.com> khimaros kiltyj klosax <131523366+klosax@users.noreply.github.com> -krystiancha kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> kunnis kuronekosaiko -kustaaya <58045274+kustaaya@users.noreply.github.com> kuvaus <22169537+kuvaus@users.noreply.github.com> kwin1412 <42286931+kwin1412@users.noreply.github.com> l3utterfly -laik ldwang le.chang leejet -leo-pony -lexasub -lhez limitedAtonement liuwei-git <14815172+liuwei-git@users.noreply.github.com> lon <114724657+longregen@users.noreply.github.com> loonerin <132926317+loonerin@users.noreply.github.com> -ltoniazzi <61414566+ltoniazzi@users.noreply.github.com> luoyu-intel m3ndax maddes8cht <55592906+maddes8cht@users.noreply.github.com> -mahorozte <41834471+mahorozte@users.noreply.github.com> makomk manikbhandari maor-ps <154728172+maor-ps@users.noreply.github.com> -mashdragon <122402293+mashdragon@users.noreply.github.com> -matiaslin <45382001+matiaslin@users.noreply.github.com> -matt23654 -matteo mdrokz mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> minarchist mj-shifu <77107165+mj-shifu@users.noreply.github.com> mmyjona momonga <115213907+mmnga@users.noreply.github.com> -momonga <146910567+mmngays@users.noreply.github.com> moritzbrantner <31051084+moritzbrantner@users.noreply.github.com> -musoles <135031143+musoles@users.noreply.github.com> mzcu nanahi <130121847+na-na-hi@users.noreply.github.com> ngc92 <7938269+ngc92@users.noreply.github.com> @@ -958,21 +716,16 @@ omahs <73983677+omahs@users.noreply.github.com> oobabooga <112222186+oobabooga@users.noreply.github.com> opparco ostix360 <55257054+ostix360@users.noreply.github.com> -pculliton -peidaqi pengxin99 perserk -piDack 
<104877312+piDack@users.noreply.github.com> pmysl postmasters pudepiedj qingfengfenga <41416092+qingfengfenga@users.noreply.github.com> -qingy1337 qouoq qunash rabidcopy rankaiyx -redbeard rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com> rhuddleston rimoliga <53384203+rimoliga@users.noreply.github.com> @@ -980,7 +733,6 @@ runfuture sandyiscool sasha0552 semidark -serhii-nakon <57632032+serhii-nakon@users.noreply.github.com> sharpHL <132747147+sharpHL@users.noreply.github.com> shibe2 singularity <12184989+singularity-s0@users.noreply.github.com> @@ -989,59 +741,42 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com> slaren <2141330+slaren@users.noreply.github.com> slaren snadampal <87143774+snadampal@users.noreply.github.com> -someone13574 <81528246+someone13574@users.noreply.github.com> -standby24x7 staviq stduhpf strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com> swittk takov751 <40316768+takov751@users.noreply.github.com> tarcey -tc-mb <157115220+tc-mb@users.noreply.github.com> texmex76 <40733439+texmex76@users.noreply.github.com> thement <40525767+thement@users.noreply.github.com> -thewh1teagle <61390950+thewh1teagle@users.noreply.github.com> tjohnman -toyer <2042519524@qq.com> tslmy ubik2 uint256_t uint256_t unbounded -uvos -uvos valiray <133289098+valiray@users.noreply.github.com> -vb vik viric vodkaslime <646329483@qq.com> vvhg1 <94630311+vvhg1@users.noreply.github.com> vxiiduu <73044267+vxiiduu@users.noreply.github.com> -wangshuai09 <391746016@qq.com> wbpxre150 <100937007+wbpxre150@users.noreply.github.com> whoreson <139810751+whoreson@users.noreply.github.com> woachk <24752637+woachk@users.noreply.github.com> wonjun Jang woodx <124784234+woodx9@users.noreply.github.com> -wwoodsTM <104587230+wwoodsTM@users.noreply.github.com> wzy <32936898+Freed-Wu@users.noreply.github.com> xaedes xaedes -xctan xloem <0xloem@gmail.com> yangli2 -ymcki <84055651+ymcki@users.noreply.github.com> yuiseki -yuri@FreeBSD zakkor zhangkaihuo -zhentaoyu zhouwg <6889919+zhouwg@users.noreply.github.com> zhouwg zrm Ștefan-Gabriel Muscalu -杨朱 · Kiki 源文雨 <41315874+fumiama@users.noreply.github.com> -蕭澧邦 <45505768+shou692199@users.noreply.github.com> -谢乃闻 Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b2a1845e..793709122 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,6 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(LLAMA_STANDALONE ON) @@ -47,13 +46,6 @@ if (WIN32) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) endif() -if (MSVC) - add_compile_options("$<$:/utf-8>") - add_compile_options("$<$:/utf-8>") - add_compile_options("$<$:/bigobj>") - add_compile_options("$<$:/bigobj>") -endif() - # # option list # @@ -70,9 +62,6 @@ option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) -# utils -option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE}) - # extra artifacts option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -80,23 +69,24 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) # 3rd 
party libs option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) -option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) -include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) # override ggml options -set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) -set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) +set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD}) +set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS}) +set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) +set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) +set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) # change the default for these ggml options if (NOT DEFINED GGML_LLAMAFILE) - set(GGML_LLAMAFILE_DEFAULT ON) + set(GGML_LLAMAFILE ON) endif() -if (NOT DEFINED GGML_CUDA_GRAPHS) - set(GGML_CUDA_GRAPHS_DEFAULT ON) +if (NOT DEFINED GGML_CUDA_USE_GRAPHS) + set(GGML_CUDA_USE_GRAPHS ON) endif() # transition helpers @@ -118,62 +108,16 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) llama_option_depr(WARNING LLAMA_CANN GGML_CANN) -if (NOT MSVC) - if (LLAMA_SANITIZE_THREAD) - message(STATUS "Using -fsanitize=thread") - - add_compile_options(-fsanitize=thread) - link_libraries (-fsanitize=thread) - endif() - - if (LLAMA_SANITIZE_ADDRESS) - message(STATUS "Using -fsanitize=address") - - add_compile_options(-fsanitize=address -fno-omit-frame-pointer) - link_libraries (-fsanitize=address) - endif() - - if (LLAMA_SANITIZE_UNDEFINED) - message(STATUS "Using -fsanitize=undefined") - - add_compile_options(-fsanitize=undefined) - link_libraries (-fsanitize=undefined) - endif() -endif() - # -# 3rd-party +# build the library # if (NOT TARGET ggml) add_subdirectory(ggml) # ... otherwise assume ggml is added by a parent CMakeLists.txt endif() - -# -# build the library -# - add_subdirectory(src) -# -# utils, programs, examples and tests -# - -if (LLAMA_BUILD_COMMON) - add_subdirectory(common) -endif() - -if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) - include(CTest) - add_subdirectory(tests) -endif() - -if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES) - add_subdirectory(examples) - add_subdirectory(pocs) -endif() - # # install # @@ -189,14 +133,18 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") -set(LLAMA_PUBLIC_HEADERS - ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h - ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h) -set_target_properties(llama - PROPERTIES - PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}") +# At the moment some compile definitions are placed within the ggml/src +# directory but not exported on the `ggml` target. This could be improved by +# determining _precisely_ which defines are necessary for the llama-config +# package. 
+# +get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS) +get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS) +set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES}) +get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) +set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) install(TARGETS llama LIBRARY PUBLIC_HEADER) configure_package_config_file( @@ -233,4 +181,20 @@ configure_file(cmake/llama.pc.in @ONLY) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" - DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) + DESTINATION lib/pkgconfig) + +# +# programs, examples and tests +# + +add_subdirectory(common) + +if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) + include(CTest) + add_subdirectory(tests) +endif () + +if (LLAMA_BUILD_EXAMPLES) + add_subdirectory(examples) + add_subdirectory(pocs) +endif() diff --git a/CMakePresets.json b/CMakePresets.json index 13bdd7907..bdad38952 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -24,24 +24,15 @@ "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." } }, - { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, - { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, - { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, - { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, - { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, - { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, - - { - "name": "x64-windows-llvm", "hidden": true, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake" - } - }, + { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, + { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, + { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, { "name": "arm64-windows-msvc", "hidden": true, - "architecture": { "value": "arm64", "strategy": "external" }, - "toolset": { "value": "host=x64", "strategy": "external" }, + "architecture": { "value": "arm64", "strategy": "external" }, + "toolset": { "value": "host=x86_64", "strategy": "external" }, "cacheVariables": { "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake" } @@ -49,49 +40,26 @@ { "name": "arm64-windows-llvm", "hidden": true, - "architecture": { "value": "arm64", "strategy": "external" }, - "toolset": { "value": "host=x64", "strategy": "external" }, + "architecture": { "value": "arm64", "strategy": "external" }, + "toolset": { "value": "host=x86_64", "strategy": "external" }, "cacheVariables": { "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake" } }, - { - "name": "arm64-apple-clang", "hidden": true, - "architecture": { "value": "arm64", "strategy": "external" }, - "toolset": { "value": "host=x64", "strategy": "external" }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake" - } - }, + { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, + { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] }, + { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", 
"arm64-windows-llvm", "reldbg", "static" ] }, - { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, - { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] }, - { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] }, - - { "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] }, - { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] }, - { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] }, - - { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, + { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, - { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] }, - { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] }, - { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] }, - { "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] }, - - { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] }, + { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] }, { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, - { "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] }, - { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] }, - { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }, - { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }, - - { "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] }, - { "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] } + { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] }, + { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] } ] } diff --git a/CODEOWNERS b/CODEOWNERS deleted file mode 100644 index 72d594b46..000000000 --- a/CODEOWNERS +++ /dev/null @@ -1,11 +0,0 @@ -# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs - -/ci/ @ggerganov -/.devops/*.Dockerfile @ngxson -/examples/server/ @ngxson -/ggml/src/ggml-cuda/fattn* @JohannesGaessler -/ggml/src/ggml-cuda/mmq.* @JohannesGaessler -/ggml/src/ggml-cuda/mmv.* @JohannesGaessler -/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler -/ggml/src/ggml-opt.cpp @JohannesGaessler -/ggml/src/gguf.cpp @JohannesGaessler diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8d411982b..b688f78ec 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,125 +1,28 @@ # Pull requests (for contributors) - Test your changes: - - Execute [the full CI locally on your machine](ci/README.md) before publishing - - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`) - - If you modified the `ggml` source, run the `test-backend-ops` 
tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends) - - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops` -- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly + - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library + - Execute [the full CI locally on your machine](ci/README.md) before publishing +- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs. + - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience - If your PR becomes stale, don't hesitate to ping the maintainers in the comments # Pull requests (for collaborators) - Squash-merge PRs - Use the following format for the squashed commit title: ` : (#)`. For example: `utils : fix typo in utils.py (#1234)` -- Optionally pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules -- Consider adding yourself to [CODEOWNERS](CODEOWNERS) +- Optionally, pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules # Coding guidelines - Avoid adding third-party dependencies, extra files, extra headers, etc. - Always consider cross-compatibility with other operating systems and architectures -- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple -- Vertical alignment makes things more readable and easier to batch edit +- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple +- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` -- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets -- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo` - - In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary - ```cpp - // OK - llama_context * ctx; - const llama_rope_type rope_type; - - // not OK - struct llama_context * ctx; - const enum llama_rope_type rope_type; - ``` - - _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_ - -- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code -- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines) +- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) - Tensors store data in row-major order. 
We refer to dimension 0 as columns, 1 as rows, 2 as matrices - Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ ![matmul](media/matmul.png) -# Naming guidelines - -- Use `snake_case` for function, variable and type names -- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) - - ```cpp - // not OK - int small_number; - int big_number; - - // OK - int number_small; - int number_big; - ``` - -- Enum values are always in upper case and prefixed with the enum name - - ```cpp - enum llama_vocab_type { - LLAMA_VOCAB_TYPE_NONE = 0, - LLAMA_VOCAB_TYPE_SPM = 1, - LLAMA_VOCAB_TYPE_BPE = 2, - LLAMA_VOCAB_TYPE_WPM = 3, - LLAMA_VOCAB_TYPE_UGM = 4, - LLAMA_VOCAB_TYPE_RWKV = 5, - }; - ``` - -- The general naming pattern is `_`, with `` being `_` - - ```cpp - llama_model_init(); // class: "llama_model", method: "init" - llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove" - llama_sampler_get_seed(); // class: "llama_sampler", method: "get_seed" - llama_set_embeddings(); // class: "llama_context", method: "set_embeddings" - llama_n_threads(); // class: "llama_context", method: "n_threads" - llama_adapter_lora_free(); // class: "llama_adapter_lora", method: "free" - ``` - - - The `get` `` can be omitted - - The `` can be omitted if not necessary - - The `_context` suffix of the `` is optional. Use it to disambiguate symbols when needed - - Use `init`/`free` for constructor/destructor `` - -- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else - - ```cpp - typedef struct llama_context * llama_context_t; - - enum llama_pooling_type llama_pooling_type(const llama_context_t ctx); - ``` - - _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_ - -- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension -- Python filenames are all lowercase with underscores - -- _(TODO: abbreviations usage)_ - -# Preprocessor directives - -- _(TODO: add guidelines with examples and apply them to the codebase)_ - - ```cpp - #ifdef FOO - #endif // FOO - ``` - -# Documentation - -- Documentation is a community effort -- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference -- When you notice incorrect or outdated documentation, please update it - -# Resources - -The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: - -https://github.com/ggerganov/llama.cpp/projects diff --git a/Makefile b/Makefile index dc3de3cb1..58a93db1a 100644 --- a/Makefile +++ b/Makefile @@ -1,18 +1,17 @@ -ifndef LLAMA_MAKEFILE -$(error The Makefile build is deprecated. Use the CMake build instead. 
For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) -endif - # Define the default target now so that it is always the first target BUILD_TARGETS = \ libllava.a \ + llama-baby-llama \ llama-batched \ llama-batched-bench \ llama-bench \ + llama-benchmark-matmult \ llama-cli \ llama-convert-llama2c-to-ggml \ llama-embedding \ llama-eval-callback \ llama-export-lora \ + llama-finetune \ llama-gbnf-validator \ llama-gguf \ llama-gguf-hash \ @@ -21,8 +20,6 @@ BUILD_TARGETS = \ llama-imatrix \ llama-infill \ llama-llava-cli \ - llama-minicpmv-cli\ - llama-qwen2vl-cli\ llama-lookahead \ llama-lookup \ llama-lookup-create \ @@ -38,29 +35,26 @@ BUILD_TARGETS = \ llama-save-load-state \ llama-server \ llama-simple \ - llama-simple-chat \ - llama-run \ llama-speculative \ llama-tokenize \ + llama-train-text-from-scratch \ llama-vdot \ llama-cvector-generator \ - llama-gen-docs \ tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ - tests/test-arg-parser \ tests/test-autorelease \ tests/test-backend-ops \ - tests/test-chat \ tests/test-chat-template \ tests/test-double-float \ + tests/test-grad0 \ tests/test-grammar-integration \ tests/test-grammar-parser \ tests/test-json-schema-to-grammar \ tests/test-llama-grammar \ - tests/test-log \ tests/test-model-load-cancel \ + tests/test-opt \ tests/test-quantize-fns \ tests/test-quantize-perf \ tests/test-rope \ @@ -68,16 +62,15 @@ TEST_TARGETS = \ tests/test-tokenizer-0 \ tests/test-tokenizer-1-bpe \ tests/test-tokenizer-1-spm -# tests/test-opt \ # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned -LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ +LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ - retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm + retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them. # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries. 
-LEGACY_TARGETS_BUILD = main quantize perplexity embedding server +LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune # Deprecation aliases ifdef LLAMA_CUBLAS @@ -99,6 +92,11 @@ GGML_METAL := 1 DEPRECATE_WARNING := 1 endif +ifdef LLAMA_OPENMP +GGML_OPENMP := 1 +DEPRECATE_WARNING := 1 +endif + ifdef LLAMA_RPC GGML_RPC := 1 DEPRECATE_WARNING := 1 @@ -149,14 +147,6 @@ GGML_NO_METAL := 1 DEPRECATE_WARNING := 1 endif -ifdef LLAMA_DISABLE_LOGS -REMOVE_WARNING := 1 -endif - -ifdef LLAMA_SERVER_VERBOSE -REMOVE_WARNING := 1 -endif - ifndef UNAME_S UNAME_S := $(shell uname -s) endif @@ -257,11 +247,11 @@ endif # Compile flags # -# keep standard at C11 and C++17 -MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU +# keep standard at C11 and C++11 +MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++17 -fPIC -MK_NVCCFLAGS = -std=c++17 +MK_CXXFLAGS = -std=c++11 -fPIC +MK_NVCCFLAGS = -std=c++11 ifdef LLAMA_NO_CCACHE GGML_NO_CCACHE := 1 @@ -297,7 +287,6 @@ endif # some memory allocation are available on Linux through GNU extensions in libc ifeq ($(UNAME_S),Linux) MK_CPPFLAGS += -D_GNU_SOURCE - MK_LDFLAGS += -ldl endif # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, @@ -338,9 +327,9 @@ ifdef LLAMA_DEBUG endif else MK_CPPFLAGS += -DNDEBUG - MK_CFLAGS += -O3 -g - MK_CXXFLAGS += -O3 -g - MK_NVCCFLAGS += -O3 -g + MK_CFLAGS += -O3 + MK_CXXFLAGS += -O3 + MK_NVCCFLAGS += -O3 endif ifdef LLAMA_SANITIZE_THREAD @@ -361,14 +350,18 @@ ifdef LLAMA_SANITIZE_UNDEFINED MK_LDFLAGS += -fsanitize=undefined -g endif +ifdef LLAMA_SERVER_VERBOSE + MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) +endif + ifdef LLAMA_SERVER_SSL MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT MK_LDFLAGS += -lssl -lcrypto endif -ifndef GGML_NO_CPU_AARCH64 - MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64 -endif +ifdef LLAMA_DISABLE_LOGS + MK_CPPFLAGS += -DLOG_DISABLE_LOGS +endif # LLAMA_DISABLE_LOGS # warnings WARN_FLAGS = \ @@ -440,17 +433,13 @@ endif # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue -ifndef RISCV_CROSS_COMPILE +ifndef RISCV ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) # Use all CPU extensions that are available: MK_CFLAGS += -march=native -mtune=native HOST_CXXFLAGS += -march=native -mtune=native - # Usage AMX build test - #MK_CFLAGS += -march=graniterapids -mtune=graniterapids - #HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids - # Usage AVX-only #MK_CFLAGS += -mfma -mf16c -mavx #MK_CXXFLAGS += -mfma -mf16c -mavx @@ -524,12 +513,7 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),) MK_CXXFLAGS += -mlasx endif -ifneq ($(filter riscv64%,$(UNAME_M)),) - MK_CFLAGS += -march=rv64gcv -mabi=lp64d - MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d -endif - -else # RISC-V CROSS COMPILATION +else MK_CFLAGS += -march=rv64gcv -mabi=lp64d MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif @@ -538,11 +522,11 @@ ifndef GGML_NO_ACCELERATE # Mac OS - include Accelerate framework. 
# `-framework Accelerate` works both with Apple Silicon and Mac Intel ifeq ($(UNAME_S),Darwin) - MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE - MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK - MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 - MK_LDFLAGS += -framework Accelerate - OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o + MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS + MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK + MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 + MK_LDFLAGS += -framework Accelerate + OBJ_GGML += ggml/src/ggml-blas.o endif endif # GGML_NO_ACCELERATE @@ -553,50 +537,42 @@ ifndef GGML_NO_OPENMP endif # GGML_NO_OPENMP ifdef GGML_OPENBLAS - MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) - MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) - MK_LDFLAGS += $(shell pkg-config --libs openblas) - OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) + MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) + MK_LDFLAGS += $(shell pkg-config --libs openblas) + OBJ_GGML += ggml/src/ggml-blas.o endif # GGML_OPENBLAS ifdef GGML_OPENBLAS64 - MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) - MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) - MK_LDFLAGS += $(shell pkg-config --libs openblas64) - OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) + MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) + MK_LDFLAGS += $(shell pkg-config --libs openblas64) + OBJ_GGML += ggml/src/ggml-blas.o endif # GGML_OPENBLAS64 ifdef GGML_BLIS - MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis - MK_LDFLAGS += -lblis -L/usr/local/lib - OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o + MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis + MK_LDFLAGS += -lblis -L/usr/local/lib + OBJ_GGML += ggml/src/ggml-blas.o endif # GGML_BLIS ifdef GGML_NVPL - MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas - MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp - OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o + MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas + MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp + OBJ_GGML += ggml/src/ggml-blas.o endif # GGML_NVPL ifndef GGML_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o + MK_CPPFLAGS += -DGGML_USE_LLAMAFILE + OBJ_GGML += ggml/src/llamafile/sgemm.o endif -ifndef GGML_NO_AMX - MK_CPPFLAGS += -DGGML_USE_AMX - OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o -endif - -# only necessary for the CPU backend files -MK_CPPFLAGS += -Iggml/src/ggml-cpu - ifdef GGML_RPC - MK_CPPFLAGS += -DGGML_USE_RPC - OBJ_GGML_EXT += ggml/src/ggml-rpc.o + MK_CPPFLAGS += -DGGML_USE_RPC + OBJ_GGML += ggml/src/ggml-rpc.o endif # GGML_RPC -OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-mma*.cu)) +OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu)) OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu)) ifdef GGML_CUDA_FA_ALL_QUANTS @@ -614,13 +590,13 @@ 
ifdef GGML_CUDA CUDA_PATH ?= /usr/local/cuda endif - MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib MK_NVCCFLAGS += -use_fast_math - OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o - OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) - OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) + OBJ_GGML += ggml/src/ggml-cuda.o + OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) + OBJ_GGML += $(OBJ_CUDA_TMPL) ifdef LLAMA_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings @@ -642,7 +618,7 @@ ifdef GGML_CUDA_NVCC NVCC = $(CCACHE) $(GGML_CUDA_NVCC) else NVCC = $(CCACHE) nvcc -endif # GGML_CUDA_NVCC +endif #GGML_CUDA_NVCC ifdef CUDA_DOCKER_ARCH MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) @@ -650,6 +626,10 @@ else ifndef CUDA_POWER_ARCH MK_NVCCFLAGS += -arch=native endif # CUDA_DOCKER_ARCH +ifdef GGML_CUDA_FORCE_DMMV + MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV +endif # GGML_CUDA_FORCE_DMMV + ifdef GGML_CUDA_FORCE_MMQ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ endif # GGML_CUDA_FORCE_MMQ @@ -658,6 +638,20 @@ ifdef GGML_CUDA_FORCE_CUBLAS MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS endif # GGML_CUDA_FORCE_CUBLAS +ifdef GGML_CUDA_DMMV_X + MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) +else + MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32 +endif # GGML_CUDA_DMMV_X + +ifdef GGML_CUDA_MMV_Y + MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) +else ifdef GGML_CUDA_DMMV_Y + MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility +else + MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1 +endif # GGML_CUDA_MMV_Y + ifdef GGML_CUDA_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16 endif # GGML_CUDA_F16 @@ -666,6 +660,12 @@ ifdef GGML_CUDA_DMMV_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16 endif # GGML_CUDA_DMMV_F16 +ifdef GGML_CUDA_KQUANTS_ITER + MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) +else + MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 +endif + ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) else @@ -701,8 +701,8 @@ ggml/src/ggml-cuda/%.o: \ ggml/src/ggml-cuda/common.cuh $(NVCC_COMPILE) -ggml/src/ggml-cuda/ggml-cuda.o: \ - ggml/src/ggml-cuda/ggml-cuda.cu \ +ggml/src/ggml-cuda.o: \ + ggml/src/ggml-cuda.cu \ ggml/include/ggml-cuda.h \ ggml/include/ggml.h \ ggml/include/ggml-backend.h \ @@ -713,9 +713,9 @@ ggml/src/ggml-cuda/ggml-cuda.o: \ endif # GGML_CUDA ifdef GGML_VULKAN - MK_CPPFLAGS += -DGGML_USE_VULKAN - MK_LDFLAGS += $(shell pkg-config --libs vulkan) - OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o + MK_CPPFLAGS += -DGGML_USE_VULKAN + MK_LDFLAGS += $(shell pkg-config --libs vulkan) + OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o ifdef GGML_VULKAN_CHECK_RESULTS MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS @@ -729,10 +729,6 @@ ifdef GGML_VULKAN_MEMORY_DEBUG MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG endif -ifdef GGML_VULKAN_PERF - MK_CPPFLAGS += -DGGML_VULKAN_PERF -endif - ifdef GGML_VULKAN_VALIDATE MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE endif @@ -745,10 +741,10 @@ GLSLC_CMD = glslc _ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen 
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp _ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp -_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders +_ggml_vk_input_dir = ggml/src/vulkan-shaders _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp) -ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) +ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@ $(_ggml_vk_header): $(_ggml_vk_source) @@ -760,12 +756,12 @@ $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen --target-hpp $(_ggml_vk_header) \ --target-cpp $(_ggml_vk_source) -vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp - $(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp + $(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp endif # GGML_VULKAN -ifdef GGML_HIP +ifdef GGML_HIPBLAS ifeq ($(wildcard /opt/rocm),) ROCM_PATH ?= /usr AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) @@ -774,7 +770,11 @@ ifdef GGML_HIP AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) endif - MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA + GGML_CUDA_DMMV_X ?= 32 + GGML_CUDA_MMV_Y ?= 1 + GGML_CUDA_KQUANTS_ITER ?= 2 + + MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA ifdef GGML_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA @@ -787,6 +787,13 @@ endif # GGML_HIP_UMA HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS)) + HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) + HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) + HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) + +ifdef GGML_CUDA_FORCE_DMMV + HIPFLAGS += -DGGML_CUDA_FORCE_DMMV +endif # GGML_CUDA_FORCE_DMMV ifdef GGML_CUDA_FORCE_MMQ HIPFLAGS += -DGGML_CUDA_FORCE_MMQ @@ -800,12 +807,12 @@ ifdef GGML_CUDA_NO_PEER_COPY HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY - OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o - OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) - OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) + OBJ_GGML += ggml/src/ggml-cuda.o + OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) + OBJ_GGML += $(OBJ_CUDA_TMPL) -ggml/src/ggml-cuda/ggml-cuda.o: \ - ggml/src/ggml-cuda/ggml-cuda.cu \ +ggml/src/ggml-cuda.o: \ + ggml/src/ggml-cuda.cu \ ggml/include/ggml-cuda.h \ ggml/include/ggml.h \ ggml/include/ggml-backend.h \ @@ -820,173 +827,70 @@ ggml/src/ggml-cuda/%.o: \ ggml/src/ggml-common.h \ ggml/src/ggml-cuda/common.cuh $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< -endif # GGML_HIP - -ifdef GGML_MUSA - ifeq ($(wildcard /opt/musa),) - MUSA_PATH ?= /usr/local/musa - else - MUSA_PATH ?= /opt/musa - endif - MUSA_ARCHITECTURES ?= 21;22 - - MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA - MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib - MK_LDFLAGS += -lmusa -lmusart -lmublas - - ifndef GGML_NO_OPENMP - # For Ubuntu Focal - MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp - MK_LDFLAGS += -L/usr/lib/llvm-10/lib - # For Ubuntu Jammy - MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include - MK_LDFLAGS += -L/usr/lib/llvm-14/lib - endif # GGML_NO_OPENMP - - CC := $(MUSA_PATH)/bin/clang - CXX := $(MUSA_PATH)/bin/clang++ - MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc - - MUSAFLAGS 
= -x musa -mtgpu - MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch)) - -ifdef GGML_CUDA_FORCE_MMQ - MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ -endif # GGML_CUDA_FORCE_MMQ - -ifdef GGML_CUDA_FORCE_CUBLAS - MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS -endif # GGML_CUDA_FORCE_CUBLAS - -ifdef GGML_CUDA_F16 - MUSAFLAGS += -DGGML_CUDA_F16 -endif # GGML_CUDA_F16 - -ifdef GGML_CUDA_DMMV_F16 - MUSAFLAGS += -DGGML_CUDA_F16 -endif # GGML_CUDA_DMMV_F16 - -ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE - MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) -else - MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -endif # GGML_CUDA_PEER_MAX_BATCH_SIZE - -ifdef GGML_CUDA_NO_PEER_COPY - MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY -endif # GGML_CUDA_NO_PEER_COPY - -ifdef GGML_CUDA_FA_ALL_QUANTS - MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS -endif # GGML_CUDA_FA_ALL_QUANTS - - OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o - OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) - OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) - -ggml/src/ggml-cuda/ggml-cuda.o: \ - ggml/src/ggml-cuda/ggml-cuda.cu \ - ggml/include/ggml-cuda.h \ - ggml/include/ggml.h \ - ggml/include/ggml-backend.h \ - ggml/src/ggml-backend-impl.h \ - ggml/src/ggml-common.h \ - $(wildcard ggml/src/ggml-cuda/*.cuh) - $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $< - -ggml/src/ggml-cuda/%.o: \ - ggml/src/ggml-cuda/%.cu \ - ggml/include/ggml.h \ - ggml/src/ggml-common.h \ - ggml/src/ggml-cuda/common.cuh - $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $< -endif # GGML_MUSA +endif # GGML_HIPBLAS ifdef GGML_METAL - MK_CPPFLAGS += -DGGML_USE_METAL - MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit - OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o - -ifdef GGML_METAL_USE_BF16 - MK_CPPFLAGS += -DGGML_METAL_USE_BF16 -endif # GGML_METAL_USE_BF16 + MK_CPPFLAGS += -DGGML_USE_METAL + MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit + OBJ_GGML += ggml/src/ggml-metal.o ifdef GGML_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif ifdef GGML_METAL_EMBED_LIBRARY - MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY - OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o + MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY + OBJ_GGML += ggml/src/ggml-metal-embed.o endif endif # GGML_METAL ifdef GGML_METAL -ggml/src/ggml-metal/ggml-metal.o: \ - ggml/src/ggml-metal/ggml-metal.m \ - ggml/src/ggml-metal/ggml-metal-impl.h \ +ggml/src/ggml-metal.o: \ + ggml/src/ggml-metal.m \ ggml/include/ggml-metal.h \ ggml/include/ggml.h $(CC) $(CFLAGS) -c $< -o $@ ifdef GGML_METAL_EMBED_LIBRARY ggml/src/ggml-metal-embed.o: \ - ggml/src/ggml-metal/ggml-metal.metal \ - ggml/src/ggml-metal/ggml-metal-impl.h \ + ggml/src/ggml-metal.metal \ ggml/src/ggml-common.h @echo "Embedding Metal library" - @sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp - @sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal - $(eval TEMP_ASSEMBLY=$(shell mktemp -d)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> 
$(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@ - @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s - @rmdir ${TEMP_ASSEMBLY} + @sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal + $(eval TEMP_ASSEMBLY=$(shell mktemp)) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) + @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) + @$(AS) $(TEMP_ASSEMBLY) -o $@ + @rm -f ${TEMP_ASSEMBLY} endif endif # GGML_METAL -DIR_GGML = ggml -DIR_LLAMA = src -DIR_COMMON = common - -OBJ_GGML = \ - $(DIR_GGML)/src/ggml.o \ - $(DIR_GGML)/src/ggml-alloc.o \ - $(DIR_GGML)/src/ggml-backend.o \ - $(DIR_GGML)/src/ggml-backend-reg.o \ - $(DIR_GGML)/src/ggml-opt.o \ - $(DIR_GGML)/src/ggml-quants.o \ - $(DIR_GGML)/src/ggml-threading.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \ - $(OBJ_GGML_EXT) +OBJ_GGML += \ + ggml/src/ggml.o \ + ggml/src/ggml-alloc.o \ + ggml/src/ggml-backend.o \ + ggml/src/ggml-quants.o \ + ggml/src/ggml-aarch64.o OBJ_LLAMA = \ - $(DIR_LLAMA)/llama.o \ - $(DIR_LLAMA)/llama-vocab.o \ - $(DIR_LLAMA)/llama-grammar.o \ - $(DIR_LLAMA)/llama-sampling.o \ - $(DIR_LLAMA)/unicode.o \ - $(DIR_LLAMA)/unicode-data.o + src/llama.o \ + src/llama-vocab.o \ + src/llama-grammar.o \ + src/llama-sampling.o \ + src/unicode.o \ + src/unicode-data.o OBJ_COMMON = \ - $(DIR_COMMON)/common.o \ - $(DIR_COMMON)/arg.o \ - $(DIR_COMMON)/log.o \ - $(DIR_COMMON)/console.o \ - $(DIR_COMMON)/ngram-cache.o \ - $(DIR_COMMON)/sampling.o \ - $(DIR_COMMON)/speculative.o \ - $(DIR_COMMON)/chat.o \ - $(DIR_COMMON)/build-info.o \ - $(DIR_COMMON)/json-schema-to-grammar.o + common/common.o \ + common/console.o \ + common/ngram-cache.o \ + common/sampling.o \ + common/train.o \ + common/grammar-parser.o \ + common/build-info.o \ + common/json-schema-to-grammar.o OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) @@ -1075,90 +979,206 @@ $(info - LLAMA_NO_CCACHE) $(info ) endif -ifdef REMOVE_WARNING -$(info !!! REMOVAL WARNING !!!) 
-$(info The following LLAMA_ options have been removed and are no longer supported) -$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418)) -$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418)) -$(info ) -endif - # # Build libraries # -# Libraries -LIB_GGML = libggml.so -LIB_GGML_S = libggml.a +# ggml -LIB_LLAMA = libllama.so -LIB_LLAMA_S = libllama.a +ggml/src/ggml.o: \ + ggml/src/ggml.c \ + ggml/include/ggml.h + $(CC) $(CFLAGS) -c $< -o $@ -LIB_COMMON = libcommon.so -LIB_COMMON_S = libcommon.a +ggml/src/ggml-alloc.o: \ + ggml/src/ggml-alloc.c \ + ggml/include/ggml.h \ + ggml/include/ggml-alloc.h + $(CC) $(CFLAGS) -c $< -o $@ -# Targets -BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S) +ggml/src/ggml-backend.o: \ + ggml/src/ggml-backend.c \ + ggml/include/ggml.h \ + ggml/include/ggml-backend.h + $(CC) $(CFLAGS) -c $< -o $@ -# Dependency files -DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d) +ggml/src/ggml-quants.o: \ + ggml/src/ggml-quants.c \ + ggml/include/ggml.h \ + ggml/src/ggml-quants.h \ + ggml/src/ggml-common.h + $(CC) $(CFLAGS) -c $< -o $@ -# Default target -all: $(BUILD_TARGETS) +ggml/src/ggml-aarch64.o: \ + ggml/src/ggml-aarch64.c \ + ggml/include/ggml.h \ + ggml/src/ggml-aarch64.h \ + ggml/src/ggml-common.h + $(CC) $(CFLAGS) -c $< -o $@ -# force c++ build for source file that have same name as c file -# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files -$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp - $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ +ggml/src/ggml-blas.o: \ + ggml/src/ggml-blas.cpp \ + ggml/include/ggml-blas.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -# Rules for building object files -$(DIR_GGML)/%.o: $(DIR_GGML)/%.c - $(CC) $(CFLAGS) -MMD -c $< -o $@ +ifndef GGML_NO_LLAMAFILE +ggml/src/llamafile/sgemm.o: \ + ggml/src/llamafile/sgemm.cpp \ + ggml/src/llamafile/sgemm.h \ + ggml/include/ggml.h + $(CXX) $(CXXFLAGS) -c $< -o $@ +endif # GGML_NO_LLAMAFILE -$(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp - $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ +ifdef GGML_RPC +ggml/src/ggml-rpc.o: \ + ggml/src/ggml-rpc.cpp \ + ggml/include/ggml-rpc.h + $(CXX) $(CXXFLAGS) -c $< -o $@ +endif # GGML_RPC -$(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp - $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ - -$(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp - $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ - -# Rules for building libraries -$(LIB_GGML): $(OBJ_GGML) +$(LIB_GGML): \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -$(LIB_GGML_S): $(OBJ_GGML) +$(LIB_GGML_S): \ + $(OBJ_GGML) ar rcs $(LIB_GGML_S) $^ -$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML) +# llama + +src/unicode.o: \ + src/unicode.cpp \ + src/unicode.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/unicode-data.o: \ + src/unicode-data.cpp \ + src/unicode-data.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama.o: \ + src/llama.cpp \ + src/llama-impl.h \ + src/llama-vocab.h \ + src/llama-grammar.h \ + src/llama-sampling.h \ + src/unicode.h \ + include/llama.h \ + ggml/include/ggml-cuda.h \ + ggml/include/ggml-metal.h \ + ggml/include/ggml.h \ + ggml/include/ggml-alloc.h \ + ggml/include/ggml-backend.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama-vocab.o: \ + src/llama-vocab.cpp \ + src/llama-vocab.h \ + src/llama-impl.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama-grammar.o: \ + src/llama-grammar.cpp \ + src/llama-grammar.h \ + src/llama-impl.h \ + src/llama-vocab.h \ + src/llama-sampling.h \ + include/llama.h + 
$(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama-sampling.o: \ + src/llama-sampling.cpp \ + src/llama-sampling.h \ + src/llama-impl.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +$(LIB_LLAMA): \ + $(OBJ_LLAMA) \ + $(LIB_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -$(LIB_LLAMA_S): $(OBJ_LLAMA) +$(LIB_LLAMA_S): \ + $(OBJ_LLAMA) ar rcs $(LIB_LLAMA_S) $^ -$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML) +# common + +common/common.o: \ + common/common.cpp \ + common/common.h \ + common/console.h \ + common/sampling.h \ + common/json.hpp \ + common/json-schema-to-grammar.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/sampling.o: \ + common/sampling.cpp \ + common/sampling.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/console.o: \ + common/console.cpp \ + common/console.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/grammar-parser.o: \ + common/grammar-parser.cpp \ + common/grammar-parser.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/json-schema-to-grammar.o: \ + common/json-schema-to-grammar.cpp \ + common/json-schema-to-grammar.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/train.o: \ + common/train.cpp \ + common/train.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/ngram-cache.o: \ + common/ngram-cache.cpp \ + common/ngram-cache.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +$(LIB_COMMON): \ + $(OBJ_COMMON) \ + $(LIB_LLAMA) \ + $(LIB_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -$(LIB_COMMON_S): $(OBJ_COMMON) +$(LIB_COMMON_S): \ + $(OBJ_COMMON) ar rcs $(LIB_COMMON_S) $^ -# Include dependency files --include $(DEP_FILES) - -# Clean generated server assets -clean-server-assets: - find examples/server -type f -name "*.js.hpp" -delete - find examples/server -type f -name "*.mjs.hpp" -delete - find examples/server -type f -name "*.css.hpp" -delete - find examples/server -type f -name "*.html.hpp" -delete - -# Clean rule -clean: clean-server-assets - rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS) - rm -rvf *.a *.dll *.so *.dot - find ggml src common tests examples pocs -type f -name "*.o" -delete - find ggml src common tests examples pocs -type f -name "*.d" -delete +clean: + rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS) + rm -rvf src/*.o + rm -rvf tests/*.o + rm -rvf examples/*.o + rm -rvf common/*.o + rm -rvf *.a + rm -rvf *.dll + rm -rvf *.so + rm -rvf *.dot + rm -rvf ggml/*.a + rm -rvf ggml/*.dll + rm -rvf ggml/*.so + rm -vrf ggml/src/*.o + rm -rvf common/build-info.cpp + rm -vrf ggml/src/ggml-metal-embed.metal + rm -vrf ggml/src/ggml-cuda/*.o + rm -vrf ggml/src/ggml-cuda/template-instances/*.o + rm -rvf $(BUILD_TARGETS) + rm -rvf $(TEST_TARGETS) + rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp + rm -rvf $(LEGACY_TARGETS_CLEAN) + find examples pocs -type f -name "*.o" -delete # # Examples @@ -1184,21 +1204,11 @@ llama-infill: examples/infill/infill.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-run: examples/run/run.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - llama-simple: examples/simple/simple.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-simple-chat: examples/simple-chat/simple-chat.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - 
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - llama-tokenize: examples/tokenize/tokenize.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) @@ -1286,16 +1296,31 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ +llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ + $(OBJ_GGML) $(OBJ_LLAMA) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + llama-bench: examples/llama-bench/llama-bench.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +llama-baby-llama: examples/baby-llama/baby-llama.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-finetune: examples/finetune/finetune.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + llama-export-lora: examples/export-lora/export-lora.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) @@ -1361,19 +1386,29 @@ llama-server: \ examples/server/server.cpp \ examples/server/utils.hpp \ examples/server/httplib.h \ + examples/server/colorthemes.css.hpp \ + examples/server/style.css.hpp \ + examples/server/theme-beeninorder.css.hpp \ + examples/server/theme-ketivah.css.hpp \ + examples/server/theme-mangotango.css.hpp \ + examples/server/theme-playground.css.hpp \ + examples/server/theme-polarnight.css.hpp \ + examples/server/theme-snowstorm.css.hpp \ examples/server/index.html.hpp \ - examples/server/loading.html.hpp \ - common/chat.cpp \ - common/chat.hpp \ - common/chat-template.hpp \ + examples/server/index-new.html.hpp \ + examples/server/index.js.hpp \ + examples/server/completion.js.hpp \ + examples/server/system-prompts.js.hpp \ + examples/server/prompt-formats.js.hpp \ + examples/server/json-schema-to-grammar.mjs.hpp \ common/json.hpp \ - common/minja.hpp \ + common/stb_image.h \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`: -examples/server/%.hpp: examples/server/public/% FORCE Makefile +examples/server/%.hpp: examples/server/public/% Makefile @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \ echo "unsigned char $${NAME}[] = {" && \ cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \ @@ -1381,11 +1416,6 @@ examples/server/%.hpp: examples/server/public/% FORCE Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ -llama-gen-docs: examples/gen-docs/gen-docs.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) 
$(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - libllava.a: examples/llava/llava.cpp \ examples/llava/llava.h \ examples/llava/clip.cpp \ @@ -1396,28 +1426,15 @@ libllava.a: examples/llava/llava.cpp \ $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual llama-llava-cli: examples/llava/llava-cli.cpp \ - examples/llava/llava.cpp \ - examples/llava/llava.h \ - examples/llava/clip.cpp \ examples/llava/clip.h \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual - -llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ - examples/llava/llava.cpp \ - examples/llava/llava.h \ examples/llava/clip.cpp \ - examples/llava/clip.h \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual - -llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \ - examples/llava/llava.cpp \ examples/llava/llava.h \ - examples/llava/clip.cpp \ - examples/llava/clip.h \ + examples/llava/llava.cpp \ $(OBJ_ALL) - $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift @@ -1441,21 +1458,21 @@ common/build-info.o: common/build-info.cpp tests: $(TEST_TARGETS) -tests/test-arg-parser: tests/test-arg-parser.cpp \ - $(OBJ_ALL) +llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \ + $(OBJ_GGML) common/build-info.o $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +run-benchmark-matmult: llama-benchmark-matmult + ./$@ + +.PHONY: run-benchmark-matmult swift + tests/test-llama-grammar: tests/test-llama-grammar.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-log: tests/test-log.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - tests/test-grammar-parser: tests/test-grammar-parser.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) @@ -1475,9 +1492,9 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \ $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-chat: tests/test-chat.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<) +tests/test-grad0: tests/test-grad0.cpp \ + $(OBJ_GGML) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-opt: tests/test-opt.cpp \ @@ -1561,45 +1578,56 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed. # # Mark legacy binary targets as .PHONY so that they are always checked. 
-.PHONY: FORCE main quantize perplexity embedding server - -# Define the object file target -examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $@ +.PHONY: main quantize perplexity embedding server finetune # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate. # Eventually we will want to remove these target from building all the time. -main: examples/deprecation-warning/deprecation-warning.o - $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) +main: examples/deprecation-warning/deprecation-warning.cpp + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead." -server: examples/deprecation-warning/deprecation-warning.o - $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) +server: examples/deprecation-warning/deprecation-warning.cpp + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead." -quantize: examples/deprecation-warning/deprecation-warning.o +quantize: examples/deprecation-warning/deprecation-warning.cpp ifneq (,$(wildcard quantize)) - $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead." @echo " Remove the 'quantize' binary to remove this warning." @echo "#########" endif -perplexity: examples/deprecation-warning/deprecation-warning.o +perplexity: examples/deprecation-warning/deprecation-warning.cpp ifneq (,$(wildcard perplexity)) - $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." @echo " Remove the 'perplexity' binary to remove this warning." @echo "#########" endif -embedding: examples/deprecation-warning/deprecation-warning.o +embedding: examples/deprecation-warning/deprecation-warning.cpp ifneq (,$(wildcard embedding)) - $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." @echo " Remove the 'embedding' binary to remove this warning." @echo "#########" endif + +finetune: examples/deprecation-warning/deprecation-warning.cpp +ifneq (,$(wildcard finetune)) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + @echo "#########" + @echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead." + @echo " Remove the 'finetune' binary to remove this warning." 
+ @echo "#########" +endif diff --git a/Package.swift b/Package.swift index 01c996d24..1d90b47bf 100644 --- a/Package.swift +++ b/Package.swift @@ -2,6 +2,48 @@ import PackageDescription +var sources = [ + "src/llama.cpp", + "src/llama-vocab.cpp", + "src/llama-grammar.cpp", + "src/llama-sampling.cpp", + "src/unicode.cpp", + "src/unicode-data.cpp", + "ggml/src/ggml.c", + "ggml/src/ggml-alloc.c", + "ggml/src/ggml-backend.c", + "ggml/src/ggml-quants.c", + "ggml/src/ggml-aarch64.c", +] + +var resources: [Resource] = [] +var linkerSettings: [LinkerSetting] = [] +var cSettings: [CSetting] = [ + .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), + .unsafeFlags(["-fno-objc-arc"]), + // NOTE: NEW_LAPACK will required iOS version 16.4+ + // We should consider add this in the future when we drop support for iOS 14 + // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) + // .define("ACCELERATE_NEW_LAPACK"), + // .define("ACCELERATE_LAPACK_ILP64") +] + +#if canImport(Darwin) +sources.append("ggml/src/ggml-metal.m") +resources.append(.process("ggml/src/ggml-metal.metal")) +linkerSettings.append(.linkedFramework("Accelerate")) +cSettings.append( + contentsOf: [ + .define("GGML_USE_ACCELERATE"), + .define("GGML_USE_METAL") + ] +) +#endif + +#if os(Linux) + cSettings.append(.define("_GNU_SOURCE")) +#endif + let package = Package( name: "llama", platforms: [ @@ -14,6 +56,24 @@ let package = Package( .library(name: "llama", targets: ["llama"]), ], targets: [ - .systemLibrary(name: "llama", pkgConfig: "llama"), - ] + .target( + name: "llama", + path: ".", + exclude: [ + "cmake", + "examples", + "scripts", + "models", + "tests", + "CMakeLists.txt", + "Makefile" + ], + sources: sources, + resources: resources, + publicHeadersPath: "spm-headers", + cSettings: cSettings, + linkerSettings: linkerSettings + ) + ], + cxxLanguageStandard: .cxx11 ) diff --git a/README.md b/README.md index 11f3d0286..7c233b5e1 100644 --- a/README.md +++ b/README.md @@ -4,52 +4,62 @@ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) +[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ +> [!IMPORTANT] +[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. 
`main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809) + ## Recent API changes -- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289) -- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291) +- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006 +- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807 +- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341 +- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122 +- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017 +- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 +- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796 +- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849 ## Hot topics -- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427 -- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode -- Universal tool call support in `llama-server`: https://github.com/ggerganov/llama.cpp/pull/9639 -- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim -- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123 -- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669 -- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) +- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430 +- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021 +- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920 +- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387 +- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404 +- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225 +- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017 +- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981 +- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962 +- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328 ---- ## Description The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide -range of hardware - locally and in the cloud. +variety of hardware - locally and in the cloud. 
- Plain C/C++ implementation without any dependencies - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks -- AVX, AVX2, AVX512 and AMX support for x86 architectures +- AVX, AVX2 and AVX512 support for x86 architectures - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use -- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA) +- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP) - Vulkan and SYCL backend support - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity -The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. +Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has +improved significantly thanks to many contributions. It is the main playground for developing new features for the +[ggml](https://github.com/ggerganov/ggml) library. -
-Models +**Supported models:** Typically finetunes of the base models below are supported as well. -Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md) - -#### Text-only - - [X] LLaMA 🦙 - [x] LLaMA 2 🦙🦙 - [x] LLaMA 3 🦙🦙🦙 @@ -73,7 +83,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) -- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003) - [x] [GPT-2](https://huggingface.co/gpt2) - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) - [x] [InternLM2](https://huggingface.co/models?search=internlm2) @@ -86,27 +95,12 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [OLMo](https://allenai.org/olmo) -- [x] [OLMo 2](https://allenai.org/olmo) -- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924) -- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) -- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520) -- [x] [Smaug](https://huggingface.co/models?search=Smaug) -- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B) -- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM) -- [x] [Flan T5](https://huggingface.co/models?search=flan-t5) -- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca) -- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat) -- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) -- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) -- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) -- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) -- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) -- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) -- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1) -- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct) +- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) -#### Multimodal +(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) + +**Multimodal models:** - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava) @@ -117,396 +111,352 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [Mini 
CPM](https://huggingface.co/models?search=MiniCPM) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) -- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge) -- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d) -
- -
-Bindings +**Bindings:** - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) -- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli) - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) -- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) -- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) -- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama) - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) -- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) -- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) -
+**UI:** -
-UIs +Unless otherwise noted these projects are open-source with permissive licensing: + +- [iohub/collama](https://github.com/iohub/coLLaMA) +- [janhq/jan](https://github.com/janhq/jan) (AGPL) +- [nat/openplayground](https://github.com/nat/openplayground) +- [Faraday](https://faraday.dev/) (proprietary) +- [LMStudio](https://lmstudio.ai/) (proprietary) +- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary) +- [LocalAI](https://github.com/mudler/LocalAI) (MIT) +- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) +- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) +- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) +- [ollama/ollama](https://github.com/ollama/ollama) +- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) +- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) +- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) +- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) +- [pythops/tenere](https://github.com/pythops/tenere) (AGPL) +- [RAGNA Desktop](https://ragna.app/) (proprietary) +- [RecurseChat](https://recurse.chat/) (proprietary) +- [semperai/amica](https://github.com/semperai/amica) +- [withcatai/catai](https://github.com/withcatai/catai) +- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) +- [Msty](https://msty.app) (proprietary) +- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) +- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later) +- [Dot](https://github.com/alexpinel/Dot) (GPL) +- [MindMac](https://mindmac.app) (proprietary) +- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) +- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) +- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) +- [AIKit](https://github.com/sozercan/aikit) (MIT) +- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* -- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) -- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) -- [Dot](https://github.com/alexpinel/Dot) (GPL) -- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) -- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0) -- [janhq/jan](https://github.com/janhq/jan) (AGPL) -- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0) -- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) -- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT) -- [LARS](https://github.com/abgulati/LARS) (AGPL) -- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) -- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) -- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) -- [LMStudio](https://lmstudio.ai/) (proprietary) -- [LocalAI](https://github.com/mudler/LocalAI) (MIT) -- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) -- [MindMac](https://mindmac.app) (proprietary) -- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) -- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) -- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0) -- [nat/openplayground](https://github.com/nat/openplayground) (MIT) -- 
[nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT) -- [ollama/ollama](https://github.com/ollama/ollama) (MIT) -- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) -- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT) -- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT) -- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT) -- [pythops/tenere](https://github.com/pythops/tenere) (AGPL) -- [ramalama](https://github.com/containers/ramalama) (MIT) -- [semperai/amica](https://github.com/semperai/amica) (MIT) -- [withcatai/catai](https://github.com/withcatai/catai) (MIT) -- [Autopen](https://github.com/blackhole89/autopen) (GPL) - -
- -
-Tools +**Tools:** - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML -- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption -- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage -- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example) -
- -
-Infrastructure +**Infrastructure:** - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp -- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs -- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly -- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server -- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale + +## Demo + +
+Typical run using LLaMA v2 13B on M2 Ultra + +``` +$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e +I llama.cpp build info: +I UNAME_S: Darwin +I UNAME_P: arm +I UNAME_M: arm64 +I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE +I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS +I LDFLAGS: -framework Accelerate +I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1) +I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1) + +make: Nothing to be done for `default'. +main: build = 1041 (cf658ad) +main: seed = 1692823051 +llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest)) +llama_model_loader: - type f32: 81 tensors +llama_model_loader: - type q4_0: 281 tensors +llama_model_loader: - type q6_K: 1 tensors +llm_load_print_meta: format = GGUF V1 (latest) +llm_load_print_meta: arch = llama +llm_load_print_meta: vocab type = SPM +llm_load_print_meta: n_vocab = 32000 +llm_load_print_meta: n_merges = 0 +llm_load_print_meta: n_ctx_train = 4096 +llm_load_print_meta: n_ctx = 512 +llm_load_print_meta: n_embd = 5120 +llm_load_print_meta: n_head = 40 +llm_load_print_meta: n_head_kv = 40 +llm_load_print_meta: n_layer = 40 +llm_load_print_meta: n_rot = 128 +llm_load_print_meta: n_gqa = 1 +llm_load_print_meta: f_norm_eps = 1.0e-05 +llm_load_print_meta: f_norm_rms_eps = 1.0e-05 +llm_load_print_meta: n_ff = 13824 +llm_load_print_meta: freq_base = 10000.0 +llm_load_print_meta: freq_scale = 1 +llm_load_print_meta: model type = 13B +llm_load_print_meta: model ftype = mostly Q4_0 +llm_load_print_meta: model size = 13.02 B +llm_load_print_meta: general.name = LLaMA v2 +llm_load_print_meta: BOS token = 1 '' +llm_load_print_meta: EOS token = 2 '' +llm_load_print_meta: UNK token = 0 '' +llm_load_print_meta: LF token = 13 '<0x0A>' +llm_load_tensors: ggml ctx size = 0.11 MB +llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state) +................................................................................................... +llama_new_context_with_model: kv self size = 400.00 MB +llama_new_context_with_model: compute buffer total size = 75.41 MB + +system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | +sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000 +generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0 + + + Building a website can be done in 10 simple steps: +Step 1: Find the right website platform. +Step 2: Choose your domain name and hosting plan. +Step 3: Design your website layout. +Step 4: Write your website content and add images. 
+Step 5: Install security features to protect your site from hackers or spammers +Step 6: Test your website on multiple browsers, mobile devices, operating systems etc… +Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine! +Step 8: Start marketing and promoting the website via social media channels or paid ads +Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc… +Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further! +How does a Website Work? +A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable! +The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking. +How to +llama_print_timings: load time = 576.45 ms +llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second) +llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second) +llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second) +llama_print_timings: total time = 25431.49 ms +```
-Games +Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook -- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. +And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook: + +https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
+## Usage + +Here are the end-to-end binary build and model conversion steps for most supported models. + +### Basic usage + +Firstly, you need to get the binary. There are different methods that you can follow: +- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md) +- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md) +- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md) +- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases) + +You can run a basic completion using this command: + +```bash +llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128 + +# Output: +# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey. +``` + +See [this page](./examples/main/README.md) for a full list of parameters. + +### Conversation mode + +If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter: + +```bash +llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv + +# Output: +# > hi, who are you? +# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today? +# +# > what is 1+1? +# Easy peasy! The answer to 1+1 is... 2! +``` + +By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) + +```bash +./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml +``` + +You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters: + +```bash +./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' +``` + +### Web server + +[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. + +Example usage: + +```bash +./llama-server -m your_model.gguf --port 8080 + +# Basic web UI can be accessed via browser: http://localhost:8080 +# Chat completion endpoint: http://localhost:8080/v1/chat/completions +``` + +### Interactive mode + +> [!NOTE] +> If you prefer basic usage, please consider using conversation mode instead of interactive mode + +In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. 
This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. + +Here is an example of a few-shot interaction, invoked with the command + +```bash +# default arguments using a 7B model +./examples/chat.sh + +# advanced chat with a 13B model +./examples/chat-13B.sh + +# custom arguments using a 13B model +./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt +``` + +Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program. + +![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) + +### Persistent Interaction + +The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. + +```bash +# Start a new chat +PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh + +# Resume that chat +PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh + +# Start a different chat with the same prompt/model +PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh + +# Different prompt cache for different prompt/model +PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ + CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh +``` + +### Constrained output with grammars + +`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: + +```bash +./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' +``` + +The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). + +For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. 
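For readers who have not written GBNF before, here is a minimal sketch of the idea: a grammar file defines a `root` rule, and `--grammar-file` points the CLI at it. The `yesno.gbnf` file name, model path and prompt below are only illustrative:

```bash
# write a tiny grammar that only allows the literal answers "yes" or "no"
cat > yesno.gbnf << 'EOF'
root ::= "yes" | "no"
EOF

# generation is now constrained to one of the two allowed strings
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 8 --grammar-file yesno.gbnf -p 'Question: is the sky blue? Answer:'
```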
+ +## Build + +Please refer to [Build llama.cpp locally](./docs/build.md) + ## Supported backends | Backend | Target devices | | --- | --- | -| [Metal](docs/build.md#metal-build) | Apple Silicon | -| [BLAS](docs/build.md#blas-build) | All | -| [BLIS](docs/backend/BLIS.md) | All | -| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | -| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU | -| [CUDA](docs/build.md#cuda) | Nvidia GPU | -| [HIP](docs/build.md#hip) | AMD GPU | -| [Vulkan](docs/build.md#vulkan) | GPU | -| [CANN](docs/build.md#cann) | Ascend NPU | +| [Metal](./docs/build.md#metal-build) | Apple Silicon | +| [BLAS](./docs/build.md#blas-build) | All | +| [BLIS](./docs/backend/BLIS.md) | All | +| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | +| [CUDA](./docs/build.md#cuda) | Nvidia GPU | +| [hipBLAS](./docs/build.md#hipblas) | AMD GPU | +| [Vulkan](./docs/build.md#vulkan) | GPU | -## Building the project +## Tools -The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h). -The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries: +### Prepare and Quantize -- Clone this repository and build locally, see [how to build](docs/build.md) -- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md) -- Use a Docker image, see [documentation for Docker](docs/docker.md) -- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases) +> [!NOTE] +> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours. -## Obtaining and quantizing models +To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. -The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`: +Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. +It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face. -- [Trending](https://huggingface.co/models?library=gguf&sort=trending) -- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf) +To learn more about quantizing model, [read this documentation](./examples/quantize/README.md) -You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf /[:quant]` +### Perplexity (measuring model quality) -After downloading a model, use the CLI tools to run it locally - see below. - -`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo. 
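As a rough sketch of that conversion path (the `./models/my-model` directory is a placeholder for any locally downloaded Hugging Face model, and the binary locations depend on how you built the project), the workflow is typically: convert to an F16 GGUF, quantize it, then run it:

```bash
# convert a local Hugging Face model directory to GGUF (F16 weights)
python3 convert_hf_to_gguf.py ./models/my-model --outfile ./models/my-model/ggml-model-f16.gguf

# quantize the F16 GGUF to Q4_0 for a smaller file and faster CPU inference
./llama-quantize ./models/my-model/ggml-model-f16.gguf ./models/my-model/ggml-model-q4_0.gguf q4_0

# run the quantized model
./llama-cli -m ./models/my-model/ggml-model-q4_0.gguf -p "I believe the meaning of life is" -n 64
```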
- -The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`: - -- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes -- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123) -- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268) -- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669) - -To learn more about model quantization, [read this documentation](examples/quantize/README.md) - -## [`llama-cli`](examples/main) - -#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality. - --
- Run in conversation mode - - Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME` - - ```bash - llama-cli -m model.gguf - - # > hi, who are you? - # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today? - # - # > what is 1+1? - # Easy peasy! The answer to 1+1 is... 2! - ``` - -
- --
- Run in conversation mode with custom chat template - - ```bash - # use the "chatml" template (use -h to see the list of supported templates) - llama-cli -m model.gguf -cnv --chat-template chatml - - # use a custom template - llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:' - ``` - -
- --
- Run simple text completion - - To disable conversation mode explicitly, use `-no-cnv` - - ```bash - llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv - - # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey. - ``` - -
- --
- Constrain the output with a custom grammar - - ```bash - llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' - - # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"} - ``` - - The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md). - - For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/ - -
- - -## [`llama-server`](examples/server) - -#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi)-compatible HTTP server for serving LLMs. - --
- Start a local HTTP server with default configuration on port 8080 - - ```bash - llama-server -m model.gguf --port 8080 - - # Basic web UI can be accessed via browser: http://localhost:8080 - # Chat completion endpoint: http://localhost:8080/v1/chat/completions - ``` - -
- --
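Once the server is up, the chat completion endpoint can be exercised with any OpenAI-compatible client; a plain `curl` sketch (assuming the default `localhost:8080` from the example above, with a request body following the OpenAI chat schema):

```bash
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user",   "content": "Write a haiku about llamas."}
          ]
        }'
```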
- Support multiple users and parallel decoding - - ```bash - # up to 4 concurrent requests, each with 4096 max context - llama-server -m model.gguf -c 16384 -np 4 - ``` - -
- --
- Enable speculative decoding - - ```bash - # the draft.gguf model should be a small variant of the target model.gguf - llama-server -m model.gguf -md draft.gguf - ``` - -
- --
- Serve an embedding model - - ```bash - # use the /embedding endpoint - llama-server -m model.gguf --embedding --pooling cls -ub 8192 - ``` - -
- --
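Because the server speaks the OpenAI API, one way to query an embedding model started as above is the OpenAI-style endpoint; a sketch (the exact request and response schemas are documented in `examples/server/README.md`):

```bash
# returns an embedding vector for the input text
curl http://localhost:8080/v1/embeddings \
    -H "Content-Type: application/json" \
    -d '{"input": "Hello, world!"}'
```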
- Serve a reranking model - - ```bash - # use the /reranking endpoint - llama-server -m model.gguf --reranking - ``` - -
- --
- Constrain all outputs with a grammar - - ```bash - # custom grammar - llama-server -m model.gguf --grammar-file grammar.gbnf - - # JSON - llama-server -m model.gguf --grammar-file grammars/json.gbnf - ``` - -
- - -## [`llama-perplexity`](examples/perplexity) - -#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text. - --
- Measure the perplexity over a text file - - ```bash - llama-perplexity -m model.gguf -f file.txt - - # [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ... - # Final estimate: PPL = 5.4007 +/- 0.67339 - ``` - -
- --
- Measure KL divergence - - ```bash - # TODO - ``` - -
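The `# TODO` above is kept as-is; as a rough, unverified sketch of the usual shape of a KL-divergence run (it assumes your build exposes the `--kl-divergence-base` and `--kl-divergence` options of `llama-perplexity` — check `llama-perplexity --help` and `examples/perplexity/README.md`; `logits.kld` is a made-up file name):

```bash
# 1) run the reference (e.g. F16) model once so its logits are recorded to the base file
#    (see the perplexity README for the exact procedure)
./llama-perplexity -m model-f16.gguf -f file.txt --kl-divergence-base logits.kld

# 2) evaluate a quantized model against the recorded logits and report KL divergence
./llama-perplexity -m model-q4_0.gguf -f file.txt --kl-divergence-base logits.kld --kl-divergence
```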
- -[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md) -[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity) - -## [`llama-bench`](examples/llama-bench) - -#### Benchmark the inference performance for various parameters. - --
- Run default benchmark - - ```bash - llama-bench -m model.gguf - - # Output: - # | model | size | params | backend | threads | test | t/s | - # | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: | - # | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | pp512 | 5765.41 ± 20.55 | - # | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | tg128 | 197.71 ± 0.81 | - # - # build: 3e0ba0e60 (4229) - ``` - -
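The default run above covers `pp512` and `tg128`; the benchmark can also be narrowed to specific prompt/generation sizes, batch sizes and GPU offload (the values below are only illustrative):

```bash
# 512-token prompt processing and 128-token generation,
# all layers offloaded, compared at two batch sizes
llama-bench -m model.gguf -p 512 -n 128 -b 512,1024 -ngl 99
```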
- -## [`llama-run`](examples/run) - -#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3]. - --
- Run a model with a specific prompt (by default it's pulled from the Ollama registry) - - ```bash - llama-run granite-code - ``` - -
- -[^3]: [RamaLama](https://github.com/containers/ramalama) - -## [`llama-simple`](examples/simple) - -#### A minimal example for implementing apps with `llama.cpp`. Useful for developers. - --
- Basic text completion - - ```bash - llama-simple -m model.gguf - - # Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of - ``` - -
+You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better). +For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). +To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md) ## Contributing - Contributors can open PRs - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch - Collaborators will be invited based on contributions -- Any help with managing issues, PRs and projects is very appreciated! +- Any help with managing issues and PRs is very appreciated! - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532) -## Other documentation +## Other documentations -- [main (cli)](examples/main/README.md) -- [server](examples/server/README.md) -- [GBNF grammars](grammars/README.md) +- [main (cli)](./examples/main/README.md) +- [server](./examples/server/README.md) +- [jeopardy](./examples/jeopardy/README.md) +- [GBNF grammars](./grammars/README.md) -#### Development documentation +**Development documentations** -- [How to build](docs/build.md) -- [Running on Docker](docs/docker.md) -- [Build on Android](docs/android.md) -- [Performance troubleshooting](docs/development/token_generation_performance_tips.md) +- [How to build](./docs/build.md) +- [Running on Docker](./docs/docker.md) +- [Build on Android](./docs/android.md) +- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md) - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) -#### Seminal papers and background on the models +**Seminal papers and background on the models** If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. 
This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: - LLaMA: @@ -517,6 +467,3 @@ If your issue is with model generation quality, then please at least scan the fo - GPT-3.5 / InstructGPT / ChatGPT: - [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) - -#### References - diff --git a/Sources/llama/llama.h b/Sources/llama/llama.h deleted file mode 100644 index 41725880e..000000000 --- a/Sources/llama/llama.h +++ /dev/null @@ -1,4 +0,0 @@ -#pragma once - -#include - diff --git a/Sources/llama/module.modulemap b/Sources/llama/module.modulemap deleted file mode 100644 index d010555b1..000000000 --- a/Sources/llama/module.modulemap +++ /dev/null @@ -1,5 +0,0 @@ -module llama [system] { - header "llama.h" - link "llama" - export * -} diff --git a/ci/run.sh b/ci/run.sh index 77c32ce00..58022c7dc 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#/bin/bash # # sample usage: # @@ -13,9 +13,6 @@ # # with SYCL support # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # -# # with VULKAN support -# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt -# if [ -z "$2" ]; then echo "usage: $0 " @@ -39,11 +36,11 @@ SRC=`pwd` CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" if [ ! -z ${GG_BUILD_METAL} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" fi if [ ! -z ${GG_BUILD_CUDA} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1" fi if [ ! -z ${GG_BUILD_SYCL} ]; then @@ -53,11 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then exit 1 fi - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" -fi - -if [ ! -z ${GG_BUILD_VULKAN} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" fi ## helpers @@ -114,7 +107,7 @@ function gg_run_ctest_debug { gg_check_build_requirements (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log @@ -145,7 +138,7 @@ function gg_run_ctest_release { gg_check_build_requirements (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log if [ -z ${GG_BUILD_LOW_PERF} ]; then (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log @@ -273,6 +266,7 @@ function gg_sum_ctest_with_model_release { } # open_llama_7b_v2 +# requires: GG_BUILD_CUDA function gg_run_open_llama_7b_v2 { cd ${SRC} @@ -296,8 +290,8 @@ function gg_run_open_llama_7b_v2 { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. 
) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf @@ -326,36 +320,36 @@ function gg_run_open_llama_7b_v2 { ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I 
believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model 
${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -431,7 +425,7 @@ function gg_run_pythia_1_4b { set -e (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf @@ -460,34 +454,34 @@ function gg_run_pythia_1_4b { ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a
$OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 
1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -541,6 +535,7 @@ function gg_sum_pythia_1_4b { } # pythia_2_8b +# requires: GG_BUILD_CUDA function gg_run_pythia_2_8b { cd ${SRC} @@ -561,8 +556,8 @@ function gg_run_pythia_2_8b { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. 
) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf @@ -591,36 +586,36 @@ function gg_run_pythia_2_8b { ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of 
life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f 
${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -697,7 +692,7 @@ function gg_run_embd_bge_small { set -e (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf @@ -706,88 +701,12 @@ function gg_run_embd_bge_small { ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 - (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log set +e } -function gg_sum_embd_bge_small { - gg_printf '### %s\n\n' "${ci}" - - gg_printf 'BGE Small (BERT):\n' - gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" - gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" - gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" -} - -# rerank_tiny - -function gg_run_rerank_tiny { - cd ${SRC} - - gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json - gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json - gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json - gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json - gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin - gg_wget models-mnt/rerank-tiny/ 
https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json - gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt - gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json - gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json - - gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json - - path_models="../models-mnt/rerank-tiny" - - rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release - - set -e - - (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log - - python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf - - model_f16="${path_models}/ggml-model-f16.gguf" - - # for this model, the SEP token is "" - (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?hi\nwhat is panda?it's a bear\nwhat is panda?The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log - - # sample output - # rerank score 0: 0.029 - # rerank score 1: 0.029 - # rerank score 2: 0.135 - - # check that the score is in the range [$3, $4] - function check_score { - qnt="$1" - score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) - - if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then - printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4" - return 20 - fi - - printf ' - %s @ %s OK\n' "$qnt" "$score" - return 0 - } - - check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log - check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log - check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log - - set +e -} - -function gg_sum_rerank_tiny { - gg_printf '### %s\n\n' "${ci}" - - gg_printf 'Rerank Tiny (Jina):\n' - gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" - gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)" -} - function gg_check_build_requirements { if ! command -v cmake &> /dev/null; then gg_printf 'cmake not found, please install' @@ -802,10 +721,16 @@ function gg_check_build_requirements { fi } -## main +function gg_sum_embd_bge_small { + gg_printf '### %s\n\n' "${ci}" -export LLAMA_LOG_PREFIX=1 -export LLAMA_LOG_TIMESTAMPS=1 + gg_printf 'BGE Small (BERT):\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" + gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" +} + +## main if [ -z ${GG_BUILD_LOW_PERF} ]; then # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt @@ -815,10 +740,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then ln -sfn ${mnt_models} ${SRC}/models-mnt # Create a fresh python3 venv and enter it - if ! python3 -m venv "$MNT/venv"; then - echo "Error: Failed to create Python virtual environment at $MNT/venv." 
- exit 1 - fi + python3 -m venv "$MNT/venv" source "$MNT/venv/bin/activate" pip install -r ${SRC}/requirements.txt --disable-pip-version-check @@ -832,7 +754,6 @@ test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run embd_bge_small - test $ret -eq 0 && gg_run rerank_tiny if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then test $ret -eq 0 && gg_run test_scripts_debug @@ -840,7 +761,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then fi if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then - if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then + if [ -z ${GG_BUILD_CUDA} ]; then test $ret -eq 0 && gg_run pythia_1_4b else test $ret -eq 0 && gg_run pythia_2_8b diff --git a/cmake/arm64-apple-clang.cmake b/cmake/arm64-apple-clang.cmake deleted file mode 100644 index 5fcd2882a..000000000 --- a/cmake/arm64-apple-clang.cmake +++ /dev/null @@ -1,16 +0,0 @@ -set( CMAKE_SYSTEM_NAME Darwin ) -set( CMAKE_SYSTEM_PROCESSOR arm64 ) - -set( target arm64-apple-darwin-macho ) - -set( CMAKE_C_COMPILER clang ) -set( CMAKE_CXX_COMPILER clang++ ) - -set( CMAKE_C_COMPILER_TARGET ${target} ) -set( CMAKE_CXX_COMPILER_TARGET ${target} ) - -set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) -set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" ) - -set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) -set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) diff --git a/cmake/build-info.cmake b/cmake/build-info.cmake index c1a456e17..ea3dc55c8 100644 --- a/cmake/build-info.cmake +++ b/cmake/build-info.cmake @@ -44,7 +44,7 @@ if(MSVC) set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME}) else() execute_process( - COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER} + COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER} OUTPUT_VARIABLE OUT OUTPUT_STRIP_TRAILING_WHITESPACE ) diff --git a/cmake/common.cmake b/cmake/common.cmake deleted file mode 100644 index 0f54871e4..000000000 --- a/cmake/common.cmake +++ /dev/null @@ -1,33 +0,0 @@ -function(llama_add_compile_flags) - if (LLAMA_FATAL_WARNINGS) - if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - list(APPEND C_FLAGS -Werror) - list(APPEND CXX_FLAGS -Werror) - elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - add_compile_options(/WX) - endif() - endif() - - if (LLAMA_ALL_WARNINGS) - if (NOT MSVC) - list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes - -Werror=implicit-int -Werror=implicit-function-declaration) - - list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) - - list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) - - list(APPEND C_FLAGS ${WARNING_FLAGS}) - list(APPEND CXX_FLAGS ${WARNING_FLAGS}) - - ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) - - add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>" - "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>") - else() - # todo : msvc - set(C_FLAGS "" PARENT_SCOPE) - set(CXX_FLAGS "" PARENT_SCOPE) - endif() - endif() -endfunction() diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index 90cbec5b6..f072b76a3 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -3,28 +3,88 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) +set(GGML_BLAS @GGML_BLAS@) +set(GGML_CUDA @GGML_CUDA@) +set(GGML_METAL @GGML_METAL@)
+set(GGML_HIPBLAS @GGML_HIPBLAS@) +set(GGML_ACCELERATE @GGML_ACCELERATE@) +set(GGML_VULKAN @GGML_VULKAN@) +set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) +set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) +set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) +set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) +set(GGML_SYCL @GGML_SYCL@) +set(GGML_OPENMP @GGML_OPENMP@) + @PACKAGE_INIT@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") -find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake) +# Ensure transient dependencies satisfied + +find_package(Threads REQUIRED) + +if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) +endif() + +if (GGML_BLAS) + find_package(BLAS REQUIRED) +endif() + +if (GGML_CUDA) + find_package(CUDAToolkit REQUIRED) +endif() + +if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) +endif() + +if (GGML_VULKAN) + find_package(Vulkan REQUIRED) +endif() + +if (GGML_HIPBLAS) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) +endif() + +if (GGML_SYCL) + find_package(IntelSYCL REQUIRED) + find_package(MKL REQUIRED) +endif() + +if (GGML_OPENMP) + find_package(OpenMP REQUIRED) +endif() + + +find_library(ggml_LIBRARY ggml + REQUIRED + HINTS ${LLAMA_LIB_DIR}) find_library(llama_LIBRARY llama REQUIRED - HINTS ${LLAMA_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH -) + HINTS ${LLAMA_LIB_DIR}) + +set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@") +set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") add_library(llama UNKNOWN IMPORTED) + set_target_properties(llama PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" - INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;" + INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" + INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" IMPORTED_LOCATION "${llama_LIBRARY}" - INTERFACE_COMPILE_FEATURES c_std_90 - POSITION_INDEPENDENT_CODE ON) + INTERFACE_COMPILE_FEATURES cxx_std_11 + POSITION_INDEPENDENT_CODE ON ) check_required_components(Llama) diff --git a/cmake/llama.pc.in b/cmake/llama.pc.in index 6fb58b5f6..326acbb61 100644 --- a/cmake/llama.pc.in +++ b/cmake/llama.pc.in @@ -1,10 +1,10 @@ prefix=@CMAKE_INSTALL_PREFIX@ -exec_prefix=@CMAKE_INSTALL_PREFIX@ -libdir=@CMAKE_INSTALL_FULL_LIBDIR@ -includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ +exec_prefix=${prefix} +libdir=${exec_prefix}/lib +includedir=${prefix}/include Name: llama Description: Port of Facebook's LLaMA model in C/C++ -Version: @LLAMA_INSTALL_VERSION@ -Libs: -L${libdir} -lggml -lggml-base -lllama +Version: @PROJECT_VERSION@ +Libs: -L${libdir} -lllama Cflags: -I${includedir} diff --git a/cmake/x64-windows-llvm.cmake b/cmake/x64-windows-llvm.cmake deleted file mode 100644 index 0603d738f..000000000 --- a/cmake/x64-windows-llvm.cmake +++ /dev/null @@ -1,11 +0,0 @@ -set( CMAKE_SYSTEM_NAME Windows ) -set( CMAKE_SYSTEM_PROCESSOR x86_64 ) - -set( CMAKE_C_COMPILER clang ) -set( CMAKE_CXX_COMPILER clang++ ) - -set( arch_c_flags "-march=native" ) - -set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" ) -set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" ) - diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index e61015d2a..761971d68 100644 --- a/common/CMakeLists.txt +++ 
b/common/CMakeLists.txt @@ -2,8 +2,6 @@ find_package(Threads REQUIRED) -llama_add_compile_flags() - # Build info header # @@ -53,28 +51,21 @@ endif() set(TARGET common) add_library(${TARGET} STATIC - arg.cpp - arg.h base64.hpp - chat.cpp - chat.hpp - chat-template.hpp - common.cpp common.h - console.cpp - console.h - json-schema-to-grammar.cpp - json.hpp - llguidance.cpp - log.cpp - log.h - minja.hpp - ngram-cache.cpp - ngram-cache.h - sampling.cpp + common.cpp sampling.h - speculative.cpp - speculative.h + sampling.cpp + console.h + console.cpp + grammar-parser.h + grammar-parser.cpp + json.hpp + json-schema-to-grammar.cpp + train.h + train.cpp + ngram-cache.h + ngram-cache.cpp ) if (BUILD_SHARED_LIBS) @@ -86,39 +77,12 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info) # Use curl to download model url if (LLAMA_CURL) find_package(CURL REQUIRED) - target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL) + add_definitions(-DLLAMA_USE_CURL) include_directories(${CURL_INCLUDE_DIRS}) find_library(CURL_LIBRARY curl REQUIRED) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) endif () -if (LLAMA_LLGUIDANCE) - include(ExternalProject) - set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source) - set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release) - ExternalProject_Add(llguidance_ext - GIT_REPOSITORY https://github.com/guidance-ai/llguidance - # v0.6.12: - GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09 - PREFIX ${CMAKE_BINARY_DIR}/llguidance - SOURCE_DIR ${LLGUIDANCE_SRC} - BUILD_IN_SOURCE TRUE - CONFIGURE_COMMAND "" - BUILD_COMMAND cargo build --release - INSTALL_COMMAND "" - BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h - UPDATE_COMMAND "" - ) - target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE) - - add_library(llguidance STATIC IMPORTED) - set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a) - add_dependencies(llguidance llguidance_ext) - - target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH}) - set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance) -endif () - target_include_directories(${TARGET} PUBLIC .) 
-target_compile_features (${TARGET} PUBLIC cxx_std_17) +target_compile_features (${TARGET} PUBLIC cxx_std_11) target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) diff --git a/common/arg.cpp b/common/arg.cpp deleted file mode 100644 index 152f671ab..000000000 --- a/common/arg.cpp +++ /dev/null @@ -1,2370 +0,0 @@ -#include "arg.h" - -#include "log.h" -#include "sampling.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "json-schema-to-grammar.h" - -using json = nlohmann::ordered_json; - -common_arg & common_arg::set_examples(std::initializer_list examples) { - this->examples = std::move(examples); - return *this; -} - -common_arg & common_arg::set_excludes(std::initializer_list excludes) { - this->excludes = std::move(excludes); - return *this; -} - -common_arg & common_arg::set_env(const char * env) { - help = help + "\n(env: " + env + ")"; - this->env = env; - return *this; -} - -common_arg & common_arg::set_sparam() { - is_sparam = true; - return *this; -} - -bool common_arg::in_example(enum llama_example ex) { - return examples.find(ex) != examples.end(); -} - -bool common_arg::is_exclude(enum llama_example ex) { - return excludes.find(ex) != excludes.end(); -} - -bool common_arg::get_value_from_env(std::string & output) { - if (env == nullptr) return false; - char * value = std::getenv(env); - if (value) { - output = value; - return true; - } - return false; -} - -bool common_arg::has_value_from_env() { - return env != nullptr && std::getenv(env); -} - -static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { - std::vector result; - std::istringstream iss(input); - std::string line; - auto add_line = [&](const std::string& l) { - if (l.length() <= max_char_per_line) { - result.push_back(l); - } else { - std::istringstream line_stream(l); - std::string word, current_line; - while (line_stream >> word) { - if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { - if (!current_line.empty()) result.push_back(current_line); - current_line = word; - } else { - current_line += (!current_line.empty() ? " " : "") + word; - } - } - if (!current_line.empty()) result.push_back(current_line); - } - }; - while (std::getline(iss, line)) { - add_line(line); - } - return result; -} - -std::string common_arg::to_string() { - // params for printing to console - const static int n_leading_spaces = 40; - const static int n_char_per_line_help = 70; // TODO: detect this based on current console - std::string leading_spaces(n_leading_spaces, ' '); - - std::ostringstream ss; - for (const auto arg : args) { - if (arg == args.front()) { - if (args.size() == 1) { - ss << arg; - } else { - // first arg is usually abbreviation, we need padding to make it more beautiful - auto tmp = std::string(arg) + ", "; - auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' '); - ss << tmp << spaces; - } - } else { - ss << arg << (arg != args.back() ? ", " : ""); - } - } - if (value_hint) ss << " " << value_hint; - if (value_hint_2) ss << " " << value_hint_2; - if (ss.tellp() > n_leading_spaces - 3) { - // current line is too long, add new line - ss << "\n" << leading_spaces; - } else { - // padding between arg and help, same line - ss << std::string(leading_spaces.size() - ss.tellp(), ' '); - } - const auto help_lines = break_str_into_lines(help, n_char_per_line_help); - for (const auto & line : help_lines) { - ss << (&line == &help_lines.front() ? 
"" : leading_spaces) << line << "\n"; - } - return ss.str(); -} - -// -// utils -// - -static void common_params_handle_model_default( - std::string & model, - const std::string & model_url, - std::string & hf_repo, - std::string & hf_file, - const std::string & hf_token, - const std::string & model_default) { - if (!hf_repo.empty()) { - // short-hand to avoid specifying --hf-file -> default it to --model - if (hf_file.empty()) { - if (model.empty()) { - auto auto_detected = common_get_hf_file(hf_repo, hf_token); - if (auto_detected.first.empty() || auto_detected.second.empty()) { - exit(1); // built without CURL, error message already printed - } - hf_repo = auto_detected.first; - hf_file = auto_detected.second; - } else { - hf_file = model; - } - } - // make sure model path is present (for caching purposes) - if (model.empty()) { - // this is to avoid different repo having same file name, or same file name in different subdirs - std::string filename = hf_repo + "_" + hf_file; - // to make sure we don't have any slashes in the filename - string_replace_all(filename, "/", "_"); - model = fs_get_cache_file(filename); - } - } else if (!model_url.empty()) { - if (model.empty()) { - auto f = string_split(model_url, '#').front(); - f = string_split(f, '?').front(); - model = fs_get_cache_file(string_split(f, '/').back()); - } - } else if (model.empty()) { - model = model_default; - } -} - -const std::vector kv_cache_types = { - GGML_TYPE_F32, - GGML_TYPE_F16, - GGML_TYPE_BF16, - GGML_TYPE_Q8_0, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, - GGML_TYPE_IQ4_NL, - GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, -}; - -static ggml_type kv_cache_type_from_str(const std::string & s) { - for (const auto & type : kv_cache_types) { - if (ggml_type_name(type) == s) { - return type; - } - } - throw std::runtime_error("Unsupported cache type: " + s); -} - -static std::string get_all_kv_cache_types() { - std::ostringstream msg; - for (const auto & type : kv_cache_types) { - msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? 
"" : ", "); - } - return msg.str(); -} - -// -// CLI argument parsing functions -// - -static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { - std::string arg; - const std::string arg_prefix = "--"; - common_params & params = ctx_arg.params; - - std::unordered_map arg_to_options; - for (auto & opt : ctx_arg.options) { - for (const auto & arg : opt.args) { - arg_to_options[arg] = &opt; - } - } - - // handle environment variables - for (auto & opt : ctx_arg.options) { - std::string value; - if (opt.get_value_from_env(value)) { - try { - if (opt.handler_void && (value == "1" || value == "true")) { - opt.handler_void(params); - } - if (opt.handler_int) { - opt.handler_int(params, std::stoi(value)); - } - if (opt.handler_string) { - opt.handler_string(params, value); - continue; - } - } catch (std::exception & e) { - throw std::invalid_argument(string_format( - "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); - } - } - } - - // handle command line arguments - auto check_arg = [&](int i) { - if (i+1 >= argc) { - throw std::invalid_argument("expected value for argument"); - } - }; - - for (int i = 1; i < argc; i++) { - const std::string arg_prefix = "--"; - - std::string arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - if (arg_to_options.find(arg) == arg_to_options.end()) { - throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); - } - auto opt = *arg_to_options[arg]; - if (opt.has_value_from_env()) { - fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); - } - try { - if (opt.handler_void) { - opt.handler_void(params); - continue; - } - - // arg with single value - check_arg(i); - std::string val = argv[++i]; - if (opt.handler_int) { - opt.handler_int(params, std::stoi(val)); - continue; - } - if (opt.handler_string) { - opt.handler_string(params, val); - continue; - } - - // arg with 2 values - check_arg(i); - std::string val2 = argv[++i]; - if (opt.handler_str_str) { - opt.handler_str_str(params, val, val2); - continue; - } - } catch (std::exception & e) { - throw std::invalid_argument(string_format( - "error while handling argument \"%s\": %s\n\n" - "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); - } - } - - postprocess_cpu_params(params.cpuparams, nullptr); - postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); - - postprocess_cpu_params(params.speculative.cpuparams, ¶ms.cpuparams); - postprocess_cpu_params(params.speculative.cpuparams_batch, ¶ms.cpuparams_batch); - - if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { - throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); - } - - // TODO: refactor model params in a common struct - common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH); - common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, ""); - common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, ""); - - if (params.escape) { - string_process_escapes(params.prompt); - 
string_process_escapes(params.input_prefix); - string_process_escapes(params.input_suffix); - for (auto & antiprompt : params.antiprompt) { - string_process_escapes(antiprompt); - } - for (auto & seq_breaker : params.sampling.dry_sequence_breakers) { - string_process_escapes(seq_breaker); - } - } - - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - if (params.reranking && params.embedding) { - throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both"); - } - - if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) { - throw std::runtime_error(string_format( - "error: the supplied chat template is not supported: %s%s\n", - params.chat_template.c_str(), - params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates" - )); - } - - return true; -} - -static void common_params_print_usage(common_params_context & ctx_arg) { - auto print_options = [](std::vector & options) { - for (common_arg * opt : options) { - printf("%s", opt->to_string().c_str()); - } - }; - - std::vector common_options; - std::vector sparam_options; - std::vector specific_options; - for (auto & opt : ctx_arg.options) { - // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example - if (opt.is_sparam) { - sparam_options.push_back(&opt); - } else if (opt.in_example(ctx_arg.ex)) { - specific_options.push_back(&opt); - } else { - common_options.push_back(&opt); - } - } - printf("----- common params -----\n\n"); - print_options(common_options); - printf("\n\n----- sampling params -----\n\n"); - print_options(sparam_options); - // TODO: maybe convert enum llama_example to string - printf("\n\n----- example-specific params -----\n\n"); - print_options(specific_options); -} - -static std::vector parse_device_list(const std::string & value) { - std::vector devices; - auto dev_names = string_split(value, ','); - if (dev_names.empty()) { - throw std::invalid_argument("no devices specified"); - } - if (dev_names.size() == 1 && dev_names[0] == "none") { - devices.push_back(nullptr); - } else { - for (const auto & device : dev_names) { - auto * dev = ggml_backend_dev_by_name(device.c_str()); - if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) { - throw std::invalid_argument(string_format("invalid device: %s", device.c_str())); - } - devices.push_back(dev); - } - devices.push_back(nullptr); - } - return devices; -} - -static void add_rpc_devices(std::string servers) { - auto rpc_servers = string_split(servers, ','); - if (rpc_servers.empty()) { - throw std::invalid_argument("no RPC servers specified"); - } - ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); - if (!rpc_reg) { - throw std::invalid_argument("failed to find RPC backend"); - } - typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint); - ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device"); - if (!ggml_backend_rpc_add_device_fn) { - throw std::invalid_argument("failed to find RPC device add function"); - } - for (const auto & server : rpc_servers) { - ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str()); - if (dev) { - ggml_backend_device_register(dev); - } else { - throw std::invalid_argument("failed to register RPC 
device"); - } - } -} - -bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { - auto ctx_arg = common_params_parser_init(params, ex, print_usage); - const common_params params_org = ctx_arg.params; // the example can modify the default params - - try { - if (!common_params_parse_ex(argc, argv, ctx_arg)) { - ctx_arg.params = params_org; - return false; - } - if (ctx_arg.params.usage) { - common_params_print_usage(ctx_arg); - if (ctx_arg.print_usage) { - ctx_arg.print_usage(argc, argv); - } - exit(0); - } - } catch (const std::invalid_argument & ex) { - fprintf(stderr, "%s\n", ex.what()); - ctx_arg.params = params_org; - return false; - } - - return true; -} - -static std::string list_builtin_chat_templates() { - std::vector supported_tmpl; - int32_t res = llama_chat_builtin_templates(nullptr, 0); - supported_tmpl.resize(res); - res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size()); - std::ostringstream msg; - for (auto & tmpl : supported_tmpl) { - msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", "); - } - return msg.str(); -} - -common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { - // load dynamic backends - ggml_backend_load_all(); - - common_params_context ctx_arg(params); - ctx_arg.print_usage = print_usage; - ctx_arg.ex = ex; - - std::string sampler_type_chars; - std::string sampler_type_names; - for (const auto & sampler : params.sampling.samplers) { - sampler_type_chars += common_sampler_type_to_chr(sampler); - sampler_type_names += common_sampler_type_to_str(sampler) + ";"; - } - sampler_type_names.pop_back(); - - - /** - * filter options by example - * rules: - * - all examples inherit options from LLAMA_EXAMPLE_COMMON - * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example - * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example - */ - auto add_opt = [&](common_arg arg) { - if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) { - ctx_arg.options.push_back(std::move(arg)); - } - }; - - - add_opt(common_arg( - {"-h", "--help", "--usage"}, - "print usage and exit", - [](common_params & params) { - params.usage = true; - } - )); - add_opt(common_arg( - {"--version"}, - "show version and build info", - [](common_params &) { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - )); - add_opt(common_arg( - {"--verbose-prompt"}, - string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](common_params & params) { - params.verbose_prompt = true; - } - )); - add_opt(common_arg( - {"--no-display-prompt"}, - string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](common_params & params) { - params.display_prompt = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"-co", "--color"}, - string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? 
"true" : "false"), - [](common_params & params) { - params.use_color = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); - add_opt(common_arg( - {"-t", "--threads"}, "N", - string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](common_params & params, int value) { - params.cpuparams.n_threads = value; - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_env("LLAMA_ARG_THREADS")); - add_opt(common_arg( - {"-tb", "--threads-batch"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads)", - [](common_params & params, int value) { - params.cpuparams_batch.n_threads = value; - if (params.cpuparams_batch.n_threads <= 0) { - params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - )); - add_opt(common_arg( - {"-C", "--cpu-mask"}, "M", - "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](common_params & params, const std::string & mask) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - )); - add_opt(common_arg( - {"-Cr", "--cpu-range"}, "lo-hi", - "range of CPUs for affinity. Complements --cpu-mask", - [](common_params & params, const std::string & range) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - )); - add_opt(common_arg( - {"--cpu-strict"}, "<0|1>", - string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](common_params & params, const std::string & value) { - params.cpuparams.strict_cpu = std::stoul(value); - } - )); - add_opt(common_arg( - {"--prio"}, "N", - string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), - [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.cpuparams.priority = (enum ggml_sched_priority) prio; - } - )); - add_opt(common_arg( - {"--poll"}, "<0...100>", - string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](common_params & params, const std::string & value) { - params.cpuparams.poll = std::stoul(value); - } - )); - add_opt(common_arg( - {"-Cb", "--cpu-mask-batch"}, "M", - "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - )); - add_opt(common_arg( - {"-Crb", "--cpu-range-batch"}, "lo-hi", - "ranges of CPUs for affinity. 
Complements --cpu-mask-batch", - [](common_params & params, const std::string & range) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - )); - add_opt(common_arg( - {"--cpu-strict-batch"}, "<0|1>", - "use strict CPU placement (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.cpuparams_batch.strict_cpu = value; - } - )); - add_opt(common_arg( - {"--prio-batch"}, "N", - string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), - [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - )); - add_opt(common_arg( - {"--poll-batch"}, "<0|1>", - "use polling to wait for work (default: same as --poll)", - [](common_params & params, int value) { - params.cpuparams_batch.poll = value; - } - )); - add_opt(common_arg( - {"-lcs", "--lookup-cache-static"}, "FNAME", - "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](common_params & params, const std::string & value) { - params.lookup_cache_static = value; - } - ).set_examples({LLAMA_EXAMPLE_LOOKUP})); - add_opt(common_arg( - {"-lcd", "--lookup-cache-dynamic"}, "FNAME", - "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](common_params & params, const std::string & value) { - params.lookup_cache_dynamic = value; - } - ).set_examples({LLAMA_EXAMPLE_LOOKUP})); - add_opt(common_arg( - {"-c", "--ctx-size"}, "N", - string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](common_params & params, int value) { - params.n_ctx = value; - } - ).set_env("LLAMA_ARG_CTX_SIZE")); - add_opt(common_arg( - {"-n", "--predict", "--n-predict"}, "N", - string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](common_params & params, int value) { - params.n_predict = value; - } - ).set_env("LLAMA_ARG_N_PREDICT")); - add_opt(common_arg( - {"-b", "--batch-size"}, "N", - string_format("logical maximum batch size (default: %d)", params.n_batch), - [](common_params & params, int value) { - params.n_batch = value; - } - ).set_env("LLAMA_ARG_BATCH")); - add_opt(common_arg( - {"-ub", "--ubatch-size"}, "N", - string_format("physical maximum batch size (default: %d)", params.n_ubatch), - [](common_params & params, int value) { - params.n_ubatch = value; - } - ).set_env("LLAMA_ARG_UBATCH")); - add_opt(common_arg( - {"--keep"}, "N", - string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](common_params & params, int value) { - params.n_keep = value; - } - )); - add_opt(common_arg( - {"--no-context-shift"}, - string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? 
"disabled" : "enabled"), - [](common_params & params) { - params.ctx_shift = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); - add_opt(common_arg( - {"--chunks"}, "N", - string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](common_params & params, int value) { - params.n_chunks = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(common_arg( - {"-fa", "--flash-attn"}, - string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](common_params & params) { - params.flash_attn = true; - } - ).set_env("LLAMA_ARG_FLASH_ATTN")); - add_opt(common_arg( - {"-p", "--prompt"}, "PROMPT", - ex == LLAMA_EXAMPLE_MAIN - ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" - : "prompt to start generation with", - [](common_params & params, const std::string & value) { - params.prompt = value; - } - ).set_excludes({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--no-perf"}, - string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), - [](common_params & params) { - params.no_perf = true; - params.sampling.no_perf = true; - } - ).set_env("LLAMA_ARG_NO_PERF")); - add_opt(common_arg( - {"-f", "--file"}, "FNAME", - "a file containing the prompt (default: none)", - [](common_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); - } - // store the external file name in params - params.prompt_file = value; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } - ).set_excludes({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--in-file"}, "FNAME", - "an input file (repeat to specify multiple files)", - [](common_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); - } - params.in_files.push_back(value); - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(common_arg( - {"-bf", "--binary-file"}, "FNAME", - "binary file containing the prompt (default: none)", - [](common_params & params, const std::string & value) { - std::ifstream file(value, std::ios::binary); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); - } - // store the external file name in params - params.prompt_file = value; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); - } - ).set_excludes({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"-e", "--escape"}, - string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), - [](common_params & params) { - params.escape = true; - } - )); - add_opt(common_arg( - {"--no-escape"}, - "do not process escape sequences", - [](common_params & params) { - params.escape = false; - } - )); - add_opt(common_arg( - {"-ptc", "--print-token-count"}, "N", - string_format("print token count every N tokens (default: %d)", params.n_print), - [](common_params & params, int value) { - params.n_print = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"--prompt-cache"}, "FNAME", - "file to cache prompt state for faster startup (default: none)", - [](common_params & params, const std::string & value) { - params.path_prompt_cache = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"--prompt-cache-all"}, - "if specified, saves user input and generations to cache as well\n", - [](common_params & params) { - params.prompt_cache_all = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"--prompt-cache-ro"}, - "if specified, uses the prompt cache but does not update it", - [](common_params & params) { - params.prompt_cache_ro = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"-r", "--reverse-prompt"}, "PROMPT", - "halt generation at PROMPT, return control in interactive mode\n", - [](common_params & params, const std::string & value) { - params.antiprompt.emplace_back(value); - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"-sp", "--special"}, - string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](common_params & params) { - params.special = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"-cnv", "--conversation"}, - "run in conversation mode:\n" - "- does not print special tokens and suffix/prefix\n" - "- interactive mode is also enabled\n" - "(default: auto enabled if chat template is available)", - [](common_params & params) { - params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"-no-cnv", "--no-conversation"}, - "force disable conversation mode (default: false)", - [](common_params & params) { - params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"-i", "--interactive"}, - string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](common_params & params) { - params.interactive = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"-if", "--interactive-first"}, - string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), - [](common_params & params) { - params.interactive_first = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"-mli", "--multiline-input"}, - "allows you to write or paste multiple lines without ending each in '\\'", - [](common_params & params) { - params.multiline_input = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"--in-prefix-bos"}, - "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](common_params & params) { - params.input_prefix_bos = true; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"--in-prefix"}, "STRING", - "string to prefix user inputs with (default: empty)", - [](common_params & params, const std::string & value) { - params.input_prefix = value; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(common_arg( - {"--in-suffix"}, "STRING", - "string to suffix after user inputs with (default: empty)", - [](common_params & params, const std::string & value) { - params.input_suffix = value; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(common_arg( - {"--no-warmup"}, - "skip warming up the model with an empty run", - [](common_params & params) { - params.warmup = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING})); - add_opt(common_arg( - {"--spm-infill"}, - string_format( - "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", - params.spm_infill ? "enabled" : "disabled" - ), - [](common_params & params) { - params.spm_infill = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); - add_opt(common_arg( - {"--samplers"}, "SAMPLERS", - string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](common_params & params, const std::string & value) { - const auto sampler_names = string_split(value, ';'); - params.sampling.samplers = common_sampler_types_from_names(sampler_names, true); - } - ).set_sparam()); - add_opt(common_arg( - {"-s", "--seed"}, "SEED", - string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED), - [](common_params & params, const std::string & value) { - params.sampling.seed = std::stoul(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--sampling-seq", "--sampler-seq"}, "SEQUENCE", - string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](common_params & params, const std::string & value) { - params.sampling.samplers = common_sampler_types_from_chars(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--ignore-eos"}, - "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](common_params & params) { - params.sampling.ignore_eos = true; - } - ).set_sparam()); - add_opt(common_arg( - {"--temp"}, "N", - string_format("temperature (default: %.1f)", (double)params.sampling.temp), - [](common_params & params, const std::string & value) { - params.sampling.temp = std::stof(value); - params.sampling.temp = std::max(params.sampling.temp, 0.0f); - } - ).set_sparam()); - add_opt(common_arg( - {"--top-k"}, "N", - string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k), - [](common_params & params, 
int value) { - params.sampling.top_k = value; - } - ).set_sparam()); - add_opt(common_arg( - {"--top-p"}, "N", - string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), - [](common_params & params, const std::string & value) { - params.sampling.top_p = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--min-p"}, "N", - string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p), - [](common_params & params, const std::string & value) { - params.sampling.min_p = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--xtc-probability"}, "N", - string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), - [](common_params & params, const std::string & value) { - params.sampling.xtc_probability = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--xtc-threshold"}, "N", - string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), - [](common_params & params, const std::string & value) { - params.sampling.xtc_threshold = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--typical"}, "N", - string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p), - [](common_params & params, const std::string & value) { - params.sampling.typ_p = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--repeat-last-n"}, "N", - string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), - [](common_params & params, int value) { - if (value < -1) { - throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value)); - } - params.sampling.penalty_last_n = value; - params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); - } - ).set_sparam()); - add_opt(common_arg( - {"--repeat-penalty"}, "N", - string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), - [](common_params & params, const std::string & value) { - params.sampling.penalty_repeat = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--presence-penalty"}, "N", - string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present), - [](common_params & params, const std::string & value) { - params.sampling.penalty_present = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--frequency-penalty"}, "N", - string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq), - [](common_params & params, const std::string & value) { - params.sampling.penalty_freq = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--dry-multiplier"}, "N", - string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), - [](common_params & params, const std::string & value) { - params.sampling.dry_multiplier = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--dry-base"}, "N", - string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base), - [](common_params & params, const std::string & value) { - float potential_base = std::stof(value); - if (potential_base >= 1.0f) - { - params.sampling.dry_base = potential_base; - } - } - ).set_sparam()); 
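The sampler handlers above (ending with --dry-base just before this point) all follow one parse-and-validate pattern: convert the raw string with std::stof/std::stoi, range-check the result, and either clamp it, ignore it, or throw so the argument parser can report the bad value. Below is a minimal standalone sketch of that pattern; it is not code from this file, and the helper names (parse_temp, parse_repeat_last_n, parse_dry_base) are illustrative only, standing in for the lambdas passed to add_opt().

    // sketch of the parse-and-validate pattern used by the sampler options above
    #include <algorithm>
    #include <stdexcept>
    #include <string>

    // mirrors the --temp handler: parse a float, clamp negative values to 0
    static float parse_temp(const std::string & value) {
        return std::max(std::stof(value), 0.0f);  // std::stof throws on malformed input
    }

    // mirrors the --repeat-last-n handler: anything below -1 is rejected
    static int parse_repeat_last_n(int value) {
        if (value < -1) {
            throw std::runtime_error("error: invalid repeat-last-n = " + std::to_string(value));
        }
        return value;
    }

    // mirrors the --dry-base handler: values below 1.0 leave the current setting unchanged
    static float parse_dry_base(const std::string & value, float current) {
        const float potential_base = std::stof(value);
        return potential_base >= 1.0f ? potential_base : current;
    }

    int main() {
        (void) parse_temp("0.8");
        (void) parse_repeat_last_n(64);
        (void) parse_dry_base("1.75", 1.75f);
        return 0;
    }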
- add_opt(common_arg( - {"--dry-allowed-length"}, "N", - string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length), - [](common_params & params, int value) { - params.sampling.dry_allowed_length = value; - } - ).set_sparam()); - add_opt(common_arg( - {"--dry-penalty-last-n"}, "N", - string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), - [](common_params & params, int value) { - if (value < -1) { - throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value)); - } - params.sampling.dry_penalty_last_n = value; - } - ).set_sparam()); - add_opt(common_arg( - {"--dry-sequence-breaker"}, "STRING", - string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n", - params.sampling.dry_sequence_breakers.empty() ? "none" : - std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()), - params.sampling.dry_sequence_breakers.end(), - std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'", - [](const std::string& a, const std::string& b) { - std::string formatted_b = (b == "\n") ? "\\n" : b; - return a + ", '" + formatted_b + "'"; - }).c_str()), - [](common_params & params, const std::string & value) { - static bool defaults_cleared = false; - - if (!defaults_cleared) { - params.sampling.dry_sequence_breakers.clear(); - defaults_cleared = true; - } - - if (value == "none") { - params.sampling.dry_sequence_breakers.clear(); - } else { - params.sampling.dry_sequence_breakers.emplace_back(value); - } - } - ).set_sparam()); - add_opt(common_arg( - {"--dynatemp-range"}, "N", - string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), - [](common_params & params, const std::string & value) { - params.sampling.dynatemp_range = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--dynatemp-exp"}, "N", - string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent), - [](common_params & params, const std::string & value) { - params.sampling.dynatemp_exponent = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--mirostat"}, "N", - string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat), - [](common_params & params, int value) { - params.sampling.mirostat = value; - } - ).set_sparam()); - add_opt(common_arg( - {"--mirostat-lr"}, "N", - string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta), - [](common_params & params, const std::string & value) { - params.sampling.mirostat_eta = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"--mirostat-ent"}, "N", - string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau), - [](common_params & params, const std::string & value) { - params.sampling.mirostat_tau = std::stof(value); - } - ).set_sparam()); - add_opt(common_arg( - {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", - "modifies the likelihood of token appearing in the completion,\n" - "i.e. 
`--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" - "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](common_params & params, const std::string & value) { - std::stringstream ss(value); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - params.sampling.logit_bias.push_back({key, bias}); - } else { - throw std::invalid_argument("invalid input format"); - } - } catch (const std::exception&) { - throw std::invalid_argument("invalid input format"); - } - } - ).set_sparam()); - add_opt(common_arg( - {"--grammar"}, "GRAMMAR", - string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()), - [](common_params & params, const std::string & value) { - params.sampling.grammar = value; - } - ).set_sparam()); - add_opt(common_arg( - {"--grammar-file"}, "FNAME", - "file to read grammar from", - [](common_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(params.sampling.grammar) - ); - } - ).set_sparam()); - add_opt(common_arg( - {"-j", "--json-schema"}, "SCHEMA", - "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](common_params & params, const std::string & value) { - params.sampling.grammar = json_schema_to_grammar(json::parse(value)); - } - ).set_sparam()); - add_opt(common_arg( - {"--pooling"}, "{none,mean,cls,last,rank}", - "pooling type for embeddings, use model default if unspecified", - [](common_params & params, const std::string & value) { - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING")); - add_opt(common_arg( - {"--attention"}, "{causal,non-causal}", - "attention type for embeddings, use model default if unspecified", - [](common_params & params, const std::string & value) { - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(common_arg( - {"--rope-scaling"}, "{none,linear,yarn}", - "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](common_params & params, const std::string & value) { - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = 
LLAMA_ROPE_SCALING_TYPE_YARN; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE")); - add_opt(common_arg( - {"--rope-scale"}, "N", - "RoPE context scaling factor, expands context by a factor of N", - [](common_params & params, const std::string & value) { - params.rope_freq_scale = 1.0f / std::stof(value); - } - ).set_env("LLAMA_ARG_ROPE_SCALE")); - add_opt(common_arg( - {"--rope-freq-base"}, "N", - "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](common_params & params, const std::string & value) { - params.rope_freq_base = std::stof(value); - } - ).set_env("LLAMA_ARG_ROPE_FREQ_BASE")); - add_opt(common_arg( - {"--rope-freq-scale"}, "N", - "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](common_params & params, const std::string & value) { - params.rope_freq_scale = std::stof(value); - } - ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); - add_opt(common_arg( - {"--yarn-orig-ctx"}, "N", - string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](common_params & params, int value) { - params.yarn_orig_ctx = value; - } - ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); - add_opt(common_arg( - {"--yarn-ext-factor"}, "N", - string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](common_params & params, const std::string & value) { - params.yarn_ext_factor = std::stof(value); - } - ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); - add_opt(common_arg( - {"--yarn-attn-factor"}, "N", - string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](common_params & params, const std::string & value) { - params.yarn_attn_factor = std::stof(value); - } - ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); - add_opt(common_arg( - {"--yarn-beta-slow"}, "N", - string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](common_params & params, const std::string & value) { - params.yarn_beta_slow = std::stof(value); - } - ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); - add_opt(common_arg( - {"--yarn-beta-fast"}, "N", - string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](common_params & params, const std::string & value) { - params.yarn_beta_fast = std::stof(value); - } - ).set_env("LLAMA_ARG_YARN_BETA_FAST")); - add_opt(common_arg( - {"-gan", "--grp-attn-n"}, "N", - string_format("group-attention factor (default: %d)", params.grp_attn_n), - [](common_params & params, int value) { - params.grp_attn_n = value; - } - ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); - add_opt(common_arg( - {"-gaw", "--grp-attn-w"}, "N", - string_format("group-attention width (default: %d)", params.grp_attn_w), - [](common_params & params, int value) { - params.grp_attn_w = value; - } - ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(common_arg( - {"-dkvc", "--dump-kv-cache"}, - "verbose print of the KV cache", - [](common_params & params) { - params.dump_kv_cache = true; - } - )); - add_opt(common_arg( - {"-nkvo", "--no-kv-offload"}, - "disable KV offload", - [](common_params & params) { - params.no_kv_offload = true; - } - ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); - add_opt(common_arg( - {"-ctk", "--cache-type-k"}, "TYPE", - string_format( - "KV cache data type for K\n" - "allowed values: %s\n" - 
"(default: %s)", - get_all_kv_cache_types().c_str(), - ggml_type_name(params.cache_type_k) - ), - [](common_params & params, const std::string & value) { - params.cache_type_k = kv_cache_type_from_str(value); - } - ).set_env("LLAMA_ARG_CACHE_TYPE_K")); - add_opt(common_arg( - {"-ctv", "--cache-type-v"}, "TYPE", - string_format( - "KV cache data type for V\n" - "allowed values: %s\n" - "(default: %s)", - get_all_kv_cache_types().c_str(), - ggml_type_name(params.cache_type_v) - ), - [](common_params & params, const std::string & value) { - params.cache_type_v = kv_cache_type_from_str(value); - } - ).set_env("LLAMA_ARG_CACHE_TYPE_V")); - add_opt(common_arg( - {"--perplexity", "--all-logits"}, - string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](common_params & params) { - params.logits_all = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(common_arg( - {"--hellaswag"}, - "compute HellaSwag score over random tasks from datafile supplied with -f", - [](common_params & params) { - params.hellaswag = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(common_arg( - {"--hellaswag-tasks"}, "N", - string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](common_params & params, int value) { - params.hellaswag_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(common_arg( - {"--winogrande"}, - "compute Winogrande score over random tasks from datafile supplied with -f", - [](common_params & params) { - params.winogrande = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(common_arg( - {"--winogrande-tasks"}, "N", - string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](common_params & params, int value) { - params.winogrande_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(common_arg( - {"--multiple-choice"}, - "compute multiple choice score over random tasks from datafile supplied with -f", - [](common_params & params) { - params.multiple_choice = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(common_arg( - {"--multiple-choice-tasks"}, "N", - string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](common_params & params, int value) { - params.multiple_choice_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(common_arg( - {"--kl-divergence"}, - "computes KL-divergence to logits provided via --kl-divergence-base", - [](common_params & params) { - params.kl_divergence = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(common_arg( - {"--save-all-logits", "--kl-divergence-base"}, "FNAME", - "set logits file", - [](common_params & params, const std::string & value) { - params.logits_file = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(common_arg( - {"--ppl-stride"}, "N", - string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](common_params & params, int value) { - params.ppl_stride = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(common_arg( - {"--ppl-output-type"}, "<0|1>", - string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](common_params & params, int value) { - params.ppl_output_type = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - 
add_opt(common_arg( - {"-dt", "--defrag-thold"}, "N", - string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](common_params & params, const std::string & value) { - params.defrag_thold = std::stof(value); - } - ).set_env("LLAMA_ARG_DEFRAG_THOLD")); - add_opt(common_arg( - {"-np", "--parallel"}, "N", - string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](common_params & params, int value) { - params.n_parallel = value; - } - ).set_env("LLAMA_ARG_N_PARALLEL")); - add_opt(common_arg( - {"-ns", "--sequences"}, "N", - string_format("number of sequences to decode (default: %d)", params.n_sequences), - [](common_params & params, int value) { - params.n_sequences = value; - } - ).set_examples({LLAMA_EXAMPLE_PARALLEL})); - add_opt(common_arg( - {"-cb", "--cont-batching"}, - string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](common_params & params) { - params.cont_batching = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); - add_opt(common_arg( - {"-nocb", "--no-cont-batching"}, - "disable continuous batching", - [](common_params & params) { - params.cont_batching = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); - add_opt(common_arg( - {"--mmproj"}, "FILE", - "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](common_params & params, const std::string & value) { - params.mmproj = value; - } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); - add_opt(common_arg( - {"--image"}, "FILE", - "path to an image file. use with multimodal models. Specify multiple times for batching", - [](common_params & params, const std::string & value) { - params.image.emplace_back(value); - } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); - if (llama_supports_rpc()) { - add_opt(common_arg( - {"--rpc"}, "SERVERS", - "comma separated list of RPC servers", - [](common_params & params, const std::string & value) { - add_rpc_devices(value); - GGML_UNUSED(params); - } - ).set_env("LLAMA_ARG_RPC")); - } - add_opt(common_arg( - {"--mlock"}, - "force system to keep model in RAM rather than swapping or compressing", - [](common_params & params) { - params.use_mlock = true; - } - ).set_env("LLAMA_ARG_MLOCK")); - add_opt(common_arg( - {"--no-mmap"}, - "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](common_params & params) { - params.use_mmap = false; - } - ).set_env("LLAMA_ARG_NO_MMAP")); - add_opt(common_arg( - {"--numa"}, "TYPE", - "attempt optimizations that help on some NUMA systems\n" - "- distribute: spread execution evenly over all nodes\n" - "- isolate: only spawn threads on CPUs on the node that execution started on\n" - "- numactl: use the CPU map provided by numactl\n" - "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](common_params & params, const std::string & value) { - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_env("LLAMA_ARG_NUMA")); - add_opt(common_arg( - {"-dev", "--device"}, "", - "comma-separated 
list of devices to use for offloading (none = don't offload)\n" - "use --list-devices to see a list of available devices", - [](common_params & params, const std::string & value) { - params.devices = parse_device_list(value); - } - ).set_env("LLAMA_ARG_DEVICE")); - add_opt(common_arg( - {"--list-devices"}, - "print list of available devices and exit", - [](common_params &) { - std::vector rpc_devices; - std::vector all_devices; - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - auto * dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { - ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); - if (ggml_backend_reg_name(reg) == std::string("RPC")) { - rpc_devices.push_back(dev); - } else { - all_devices.push_back(dev); - } - } - } - // insert RPC devices in front - all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end()); - printf("Available devices:\n"); - for (size_t i = 0; i < all_devices.size(); ++i) { - auto * dev = all_devices[i]; - size_t free, total; - ggml_backend_dev_memory(dev, &free, &total); - printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); - } - exit(0); - } - )); - add_opt(common_arg( - {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", - "number of layers to store in VRAM", - [](common_params & params, int value) { - params.n_gpu_layers = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); - fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); - } - } - ).set_env("LLAMA_ARG_N_GPU_LAYERS")); - add_opt(common_arg( - {"-sm", "--split-mode"}, "{none,layer,row}", - "how to split the model across multiple GPUs, one of:\n" - "- none: use one GPU only\n" - "- layer (default): split layers and KV across GPUs\n" - "- row: split rows across GPUs", - [](common_params & params, const std::string & value) { - std::string arg_next = value; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } else { - throw std::invalid_argument("invalid value"); - } - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n"); - } - } - ).set_env("LLAMA_ARG_SPLIT_MODE")); - add_opt(common_arg( - {"-ts", "--tensor-split"}, "N0,N1,N2,...", - "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 
3,1", - [](common_params & params, const std::string & value) { - std::string arg_next = value; - - // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - if (split_arg.size() >= llama_max_devices()) { - throw std::invalid_argument( - string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) - ); - } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); - } else { - params.tensor_split[i] = 0.0f; - } - } - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n"); - } - } - ).set_env("LLAMA_ARG_TENSOR_SPLIT")); - add_opt(common_arg( - {"-mg", "--main-gpu"}, "INDEX", - string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](common_params & params, int value) { - params.main_gpu = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); - } - } - ).set_env("LLAMA_ARG_MAIN_GPU")); - add_opt(common_arg( - {"--check-tensors"}, - string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](common_params & params) { - params.check_tensors = true; - } - )); - add_opt(common_arg( - {"--override-kv"}, "KEY=TYPE:VALUE", - "advanced option to override model metadata by key. may be specified multiple times.\n" - "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](common_params & params, const std::string & value) { - if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { - throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); - } - } - )); - add_opt(common_arg( - {"--lora"}, "FNAME", - "path to LoRA adapter (can be repeated to use multiple adapters)", - [](common_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); - } - // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(common_arg( - {"--lora-scaled"}, "FNAME", "SCALE", - "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); - } - // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(common_arg( - {"--control-vector"}, "FNAME", - "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](common_params & params, const std::string & value) { - params.control_vectors.push_back({ 1.0f, value, }); - } - )); - add_opt(common_arg( - {"--control-vector-scaled"}, "FNAME", "SCALE", - "add a control vector with user defined scaling SCALE\n" - "note: this argument can be repeated to add multiple scaled control vectors", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.control_vectors.push_back({ std::stof(scale), fname }); - } - )); - add_opt(common_arg( - {"--control-vector-layer-range"}, "START", "END", - "layer range to apply the control vector(s) to, start and end inclusive", - [](common_params & params, const std::string & start, const std::string & end) { - params.control_vector_layer_start = std::stoi(start); - params.control_vector_layer_end = std::stoi(end); - } - )); - add_opt(common_arg( - {"-a", "--alias"}, "STRING", - "set alias for model name (to be used by REST API)", - [](common_params & params, const std::string & value) { - params.model_alias = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); - add_opt(common_arg( - {"-m", "--model"}, "FNAME", - ex == LLAMA_EXAMPLE_EXPORT_LORA - ? 
std::string("model path from which to load base model") - : string_format( - "model path (default: `models/$filename` with filename from `--hf-file` " - "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH - ), - [](common_params & params, const std::string & value) { - params.model = value; - } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); - add_opt(common_arg( - {"-mu", "--model-url"}, "MODEL_URL", - "model download url (default: unused)", - [](common_params & params, const std::string & value) { - params.model_url = value; - } - ).set_env("LLAMA_ARG_MODEL_URL")); - add_opt(common_arg( - {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", - "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" - "example: unsloth/phi-4-GGUF:q4_k_m\n" - "(default: unused)", - [](common_params & params, const std::string & value) { - params.hf_repo = value; - } - ).set_env("LLAMA_ARG_HF_REPO")); - add_opt(common_arg( - {"-hfd", "-hfrd", "--hf-repo-draft"}, "/[:quant]", - "Same as --hf-repo, but for the draft model (default: unused)", - [](common_params & params, const std::string & value) { - params.speculative.hf_repo = value; - } - ).set_env("LLAMA_ARG_HFD_REPO")); - add_opt(common_arg( - {"-hff", "--hf-file"}, "FILE", - "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)", - [](common_params & params, const std::string & value) { - params.hf_file = value; - } - ).set_env("LLAMA_ARG_HF_FILE")); - add_opt(common_arg( - {"-hfv", "-hfrv", "--hf-repo-v"}, "/[:quant]", - "Hugging Face model repository for the vocoder model (default: unused)", - [](common_params & params, const std::string & value) { - params.vocoder.hf_repo = value; - } - ).set_env("LLAMA_ARG_HF_REPO_V")); - add_opt(common_arg( - {"-hffv", "--hf-file-v"}, "FILE", - "Hugging Face model file for the vocoder model (default: unused)", - [](common_params & params, const std::string & value) { - params.vocoder.hf_file = value; - } - ).set_env("LLAMA_ARG_HF_FILE_V")); - add_opt(common_arg( - {"-hft", "--hf-token"}, "TOKEN", - "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](common_params & params, const std::string & value) { - params.hf_token = value; - } - ).set_env("HF_TOKEN")); - add_opt(common_arg( - {"--context-file"}, "FNAME", - "file to load context from (repeat to specify multiple files)", - [](common_params & params, const std::string & value) { - std::ifstream file(value, std::ios::binary); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); - } - params.context_files.push_back(value); - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(common_arg( - {"--chunk-size"}, "N", - string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](common_params & params, int value) { - params.chunk_size = value; - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(common_arg( - {"--chunk-separator"}, "STRING", - string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](common_params & params, const std::string & value) { - params.chunk_separator = value; - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(common_arg( - {"--junk"}, "N", - string_format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](common_params & params, int value) { - 
params.n_junk = value; - } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(common_arg( - {"--pos"}, "N", - string_format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](common_params & params, int value) { - params.i_pos = value; - } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(common_arg( - {"-o", "--output", "--output-file"}, "FNAME", - string_format("output file (default: '%s')", - ex == LLAMA_EXAMPLE_EXPORT_LORA - ? params.lora_outfile.c_str() - : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR - ? params.cvector_outfile.c_str() - : params.out_file.c_str()), - [](common_params & params, const std::string & value) { - params.out_file = value; - params.cvector_outfile = value; - params.lora_outfile = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(common_arg( - {"-ofreq", "--output-frequency"}, "N", - string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](common_params & params, int value) { - params.n_out_freq = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(common_arg( - {"--save-frequency"}, "N", - string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](common_params & params, int value) { - params.n_save_freq = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(common_arg( - {"--process-output"}, - string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](common_params & params) { - params.process_output = true; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(common_arg( - {"--no-ppl"}, - string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](common_params & params) { - params.compute_ppl = false; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(common_arg( - {"--chunk", "--from-chunk"}, "N", - string_format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](common_params & params, int value) { - params.i_chunk = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(common_arg( - {"-pps"}, - string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), - [](common_params & params) { - params.is_pp_shared = true; - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(common_arg( - {"-npp"}, "n0,n1,...", - "number of prompt tokens", - [](common_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(common_arg( - {"-ntg"}, "n0,n1,...", - "number of text generation tokens", - [](common_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(common_arg( - {"-npl"}, "n0,n1,...", - "number of parallel prompts", - [](common_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(common_arg( - {"--embd-normalize"}, "N", - string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](common_params & params, int value) { - params.embd_normalize = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(common_arg( - {"--embd-output-format"}, "FORMAT", - "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](common_params & params, const std::string & value) { - params.embd_out = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(common_arg( - {"--embd-separator"}, "STRING", - "separator of embeddings (default \\n) for example \"<#sep#>\"", - [](common_params & params, const std::string & value) { - params.embd_sep = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(common_arg( - {"--host"}, "HOST", - string_format("ip address to listen (default: %s)", params.hostname.c_str()), - [](common_params & params, const std::string & value) { - params.hostname = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); - add_opt(common_arg( - {"--port"}, "PORT", - string_format("port to listen (default: %d)", params.port), - [](common_params & params, int value) { - params.port = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); - add_opt(common_arg( - {"--path"}, "PATH", - string_format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](common_params & params, const std::string & value) { - params.public_path = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); - add_opt(common_arg( - {"--no-webui"}, - string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), - [](common_params & params) { - params.webui = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); - add_opt(common_arg( - {"--embedding", "--embeddings"}, - string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](common_params & params) { - params.embedding = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); - add_opt(common_arg( - {"--reranking", "--rerank"}, - string_format("enable reranking endpoint on server (default: %s)", params.reranking ? 
"enabled" : "disabled"), - [](common_params & params) { - params.reranking = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); - add_opt(common_arg( - {"--api-key"}, "KEY", - "API key to use for authentication (default: none)", - [](common_params & params, const std::string & value) { - params.api_keys.push_back(value); - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); - add_opt(common_arg( - {"--api-key-file"}, "FNAME", - "path to file containing API keys (default: none)", - [](common_params & params, const std::string & value) { - std::ifstream key_file(value); - if (!key_file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); - } - std::string key; - while (std::getline(key_file, key)) { - if (!key.empty()) { - params.api_keys.push_back(key); - } - } - key_file.close(); - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--ssl-key-file"}, "FNAME", - "path to file a PEM-encoded SSL private key", - [](common_params & params, const std::string & value) { - params.ssl_file_key = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE")); - add_opt(common_arg( - {"--ssl-cert-file"}, "FNAME", - "path to file a PEM-encoded SSL certificate", - [](common_params & params, const std::string & value) { - params.ssl_file_cert = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); - add_opt(common_arg( - {"-to", "--timeout"}, "N", - string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](common_params & params, int value) { - params.timeout_read = value; - params.timeout_write = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); - add_opt(common_arg( - {"--threads-http"}, "N", - string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](common_params & params, int value) { - params.n_threads_http = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); - add_opt(common_arg( - {"--cache-reuse"}, "N", - string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse), - [](common_params & params, int value) { - params.n_cache_reuse = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); - add_opt(common_arg( - {"--metrics"}, - string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_metrics = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); - add_opt(common_arg( - {"--slots"}, - string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_slots = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); - add_opt(common_arg( - {"--props"}, - string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? 
"enabled" : "disabled"), - [](common_params & params) { - params.endpoint_props = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); - add_opt(common_arg( - {"--no-slots"}, - "disables slots monitoring endpoint", - [](common_params & params) { - params.endpoint_slots = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); - add_opt(common_arg( - {"--slot-save-path"}, "PATH", - "path to save slot kv cache (default: disabled)", - [](common_params & params, const std::string & value) { - params.slot_save_path = value; - // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; - } - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--jinja"}, - "use jinja template for chat (default: disabled)", - [](common_params & params) { - params.use_jinja = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA")); - add_opt(common_arg( - {"--chat-template"}, "JINJA_TEMPLATE", - string_format( - "set custom jinja chat template (default: template taken from model's metadata)\n" - "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted (unless --jinja is set before this flag):\n" - "list of built-in templates:\n%s", list_builtin_chat_templates().c_str() - ), - [](common_params & params, const std::string & value) { - params.chat_template = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); - add_opt(common_arg( - {"--chat-template-file"}, "JINJA_TEMPLATE_FILE", - string_format( - "set custom jinja chat template file (default: template taken from model's metadata)\n" - "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted (unless --jinja is set before this flag):\n" - "list of built-in templates:\n%s", list_builtin_chat_templates().c_str() - ), - [](common_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(params.chat_template)); - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); - add_opt(common_arg( - {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", - string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](common_params & params, const std::string & value) { - params.slot_prompt_similarity = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--lora-init-without-apply"}, - string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), - [](common_params & params) { - params.lora_init_without_apply = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--simple-io"}, - "use basic IO for better compatibility in subprocesses and limited consoles", - [](common_params & params) { - params.simple_io = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(common_arg( - {"--positive-file"}, "FNAME", - string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](common_params & params, const std::string & value) { - params.cvector_positive_file = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(common_arg( - {"--negative-file"}, "FNAME", - string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](common_params & params, const std::string & value) { - params.cvector_negative_file = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(common_arg( - {"--pca-batch"}, "N", - string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](common_params & params, int value) { - params.n_pca_batch = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(common_arg( - {"--pca-iter"}, "N", - string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](common_params & params, int value) { - params.n_pca_iterations = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(common_arg( - {"--method"}, "{pca, mean}", - "dimensionality reduction method to be used (default: pca)", - [](common_params & params, const std::string & value) { - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(common_arg( - {"--output-format"}, "{md,jsonl}", - "output format for batched-bench results (default: md)", - [](common_params & params, const std::string & value) { - /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } - else if (value == "md") { params.batched_bench_output_jsonl = false; } - else { std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(common_arg( - {"--log-disable"}, - "Log disable", - [](common_params &) { - common_log_pause(common_log_main()); - } - )); - add_opt(common_arg( - {"--log-file"}, "FNAME", - "Log to file", - [](common_params &, const std::string & value) { - common_log_set_file(common_log_main(), value.c_str()); - } - )); - add_opt(common_arg( - {"--log-colors"}, - "Enable colored logging", - [](common_params &) { - common_log_set_colors(common_log_main(), true); - } - ).set_env("LLAMA_LOG_COLORS")); - add_opt(common_arg( - {"-v", "--verbose", "--log-verbose"}, - "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", - [](common_params & params) { - params.verbosity = INT_MAX; - common_log_set_verbosity_thold(INT_MAX); - } - )); - add_opt(common_arg( - {"-lv", "--verbosity", "--log-verbosity"}, "N", - "Set the verbosity threshold. 
Messages with a higher verbosity will be ignored.", - [](common_params & params, int value) { - params.verbosity = value; - common_log_set_verbosity_thold(value); - } - ).set_env("LLAMA_LOG_VERBOSITY")); - add_opt(common_arg( - {"--log-prefix"}, - "Enable prefix in log messages", - [](common_params &) { - common_log_set_prefix(common_log_main(), true); - } - ).set_env("LLAMA_LOG_PREFIX")); - add_opt(common_arg( - {"--log-timestamps"}, - "Enable timestamps in log messages", - [](common_params &) { - common_log_set_timestamps(common_log_main(), true); - } - ).set_env("LLAMA_LOG_TIMESTAMPS")); - - // speculative parameters - add_opt(common_arg( - {"-td", "--threads-draft"}, "N", - "number of threads to use during generation (default: same as --threads)", - [](common_params & params, int value) { - params.speculative.cpuparams.n_threads = value; - if (params.speculative.cpuparams.n_threads <= 0) { - params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-tbd", "--threads-batch-draft"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.n_threads = value; - if (params.speculative.cpuparams_batch.n_threads <= 0) { - params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Cd", "--cpu-mask-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.speculative.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Crd", "--cpu-range-draft"}, "lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft", - [](common_params & params, const std::string & range) { - params.speculative.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--cpu-strict-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.speculative.cpuparams.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--prio-draft"}, "N", - string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority), - [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--poll-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: same as --poll)", - [](common_params & params, int value) { - params.speculative.cpuparams.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Cbd", "--cpu-mask-batch-draft"}, "M", - "Draft model CPU affinity mask.
Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.speculative.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", - [](common_params & params, const std::string & range) { - params.speculative.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--cpu-strict-batch-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--prio-batch-draft"}, "N", - string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority), - [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--poll-batch-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: --poll-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--draft-max", "--draft", "--draft-n"}, "N", - string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), - [](common_params & params, int value) { - params.speculative.n_max = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX")); - add_opt(common_arg( - {"--draft-min", "--draft-n-min"}, "N", - string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), - [](common_params & params, int value) { - params.speculative.n_min = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN")); - add_opt(common_arg( - {"--draft-p-split"}, "P", - string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), - [](common_params & params, const std::string & value) { - params.speculative.p_split = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); - add_opt(common_arg( - {"--draft-p-min"}, "P", - string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), - [](common_params & params, const std::string & value) { - params.speculative.p_min = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN")); - add_opt(common_arg( - {"-cd", "--ctx-size-draft"}, "N", - string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), - [](common_params & params, int 
value) { - params.speculative.n_ctx = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); - add_opt(common_arg( - {"-devd", "--device-draft"}, "", - "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" - "use --list-devices to see a list of available devices", - [](common_params & params, const std::string & value) { - params.speculative.devices = parse_device_list(value); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", - "number of layers to store in VRAM for the draft model", - [](common_params & params, int value) { - params.speculative.n_gpu_layers = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); - fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT")); - add_opt(common_arg( - {"-md", "--model-draft"}, "FNAME", - "draft model for speculative decoding (default: unused)", - [](common_params & params, const std::string & value) { - params.speculative.model = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); - - add_opt(common_arg( - {"-mv", "--model-vocoder"}, "FNAME", - "vocoder model for audio generation (default: unused)", - [](common_params & params, const std::string & value) { - params.vocoder.model = value; - } - ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--tts-use-guide-tokens"}, - "Use guide tokens to improve TTS word recall", - [](common_params & params) { - params.vocoder.use_guide_tokens = true; - } - ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); - - // model-specific - add_opt(common_arg( - {"--tts-oute-default"}, - string_format("use default OuteTTS models (note: can download weights from the internet)"), - [](common_params & params) { - params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; - params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; - params.vocoder.hf_repo = "ggml-org/WavTokenizer"; - params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; - } - ).set_examples({LLAMA_EXAMPLE_TTS})); - - add_opt(common_arg( - {"--embd-bge-small-en-default"}, - string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"), - [](common_params & params) { - params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF"; - params.hf_file = "bge-small-en-v1.5-q8_0.gguf"; - params.pooling_type = LLAMA_POOLING_TYPE_NONE; - params.embd_normalize = 2; - params.n_ctx = 512; - params.verbose_prompt = true; - params.embedding = true; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER})); - - add_opt(common_arg( - {"--embd-e5-small-en-default"}, - string_format("use default e5-small-v2 model (note: can download weights from the internet)"), - [](common_params & params) { - params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF"; - params.hf_file = "e5-small-v2-q8_0.gguf"; - params.pooling_type = LLAMA_POOLING_TYPE_NONE; - params.embd_normalize = 2; - params.n_ctx = 512; - params.verbose_prompt = true; - params.embedding = true; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING, 
LLAMA_EXAMPLE_SERVER})); - - add_opt(common_arg( - {"--embd-gte-small-default"}, - string_format("use default gte-small model (note: can download weights from the internet)"), - [](common_params & params) { - params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF"; - params.hf_file = "gte-small-q8_0.gguf"; - params.pooling_type = LLAMA_POOLING_TYPE_NONE; - params.embd_normalize = 2; - params.n_ctx = 512; - params.verbose_prompt = true; - params.embedding = true; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER})); - - return ctx_arg; -} diff --git a/common/arg.h b/common/arg.h deleted file mode 100644 index 49ab8667b..000000000 --- a/common/arg.h +++ /dev/null @@ -1,80 +0,0 @@ -#pragma once - -#include "common.h" - -#include -#include -#include - -// -// CLI argument parsing -// - -struct common_arg { - std::set examples = {LLAMA_EXAMPLE_COMMON}; - std::set excludes = {}; - std::vector args; - const char * value_hint = nullptr; // help text or example for arg value - const char * value_hint_2 = nullptr; // for second arg value - const char * env = nullptr; - std::string help; - bool is_sparam = false; // is current arg a sampling param? - void (*handler_void) (common_params & params) = nullptr; - void (*handler_string) (common_params & params, const std::string &) = nullptr; - void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; - void (*handler_int) (common_params & params, int) = nullptr; - - common_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, - void (*handler)(common_params & params, const std::string &) - ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} - - common_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, - void (*handler)(common_params & params, int) - ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} - - common_arg( - const std::initializer_list & args, - const std::string & help, - void (*handler)(common_params & params) - ) : args(args), help(help), handler_void(handler) {} - - // support 2 values for arg - common_arg( - const std::initializer_list & args, - const char * value_hint, - const char * value_hint_2, - const std::string & help, - void (*handler)(common_params & params, const std::string &, const std::string &) - ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - - common_arg & set_examples(std::initializer_list examples); - common_arg & set_excludes(std::initializer_list excludes); - common_arg & set_env(const char * env); - common_arg & set_sparam(); - bool in_example(enum llama_example ex); - bool is_exclude(enum llama_example ex); - bool get_value_from_env(std::string & output); - bool has_value_from_env(); - std::string to_string(); -}; - -struct common_params_context { - enum llama_example ex = LLAMA_EXAMPLE_COMMON; - common_params & params; - std::vector options; - void(*print_usage)(int, char **) = nullptr; - common_params_context(common_params & params) : params(params) {} -}; - -// parse input arguments from CLI -// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) -bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); - -// function to be used by test-arg-parser -common_params_context common_params_parser_init(common_params & 
params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/chat-template.hpp b/common/chat-template.hpp deleted file mode 100644 index 882ba41bd..000000000 --- a/common/chat-template.hpp +++ /dev/null @@ -1,529 +0,0 @@ -/* - Copyright 2024 Google LLC - - Use of this source code is governed by an MIT-style - license that can be found in the LICENSE file or at - https://opensource.org/licenses/MIT. -*/ -// SPDX-License-Identifier: MIT -#pragma once - -#include "minja.hpp" -#include -#include -#include - -using json = nlohmann::ordered_json; - -namespace minja { - -struct chat_template_caps { - bool supports_tools = false; - bool supports_tool_calls = false; - bool supports_tool_responses = false; - bool supports_system_role = false; - bool supports_parallel_tool_calls = false; - bool supports_tool_call_id = false; - // meta-llama/Llama-3.1-8B-Instruct expects arguments to be an object. - // Most other templates (and OpenAI's API) expect the arguments object to be stringified. - bool requires_object_arguments = false; - // CohereForAI/c4ai-command-r-plus simple variant - bool requires_non_null_content = false; - // MiniMaxAI/MiniMax-Text-01 special - bool requires_typed_content = false; -}; - -struct chat_template_inputs { - nlohmann::ordered_json messages; - nlohmann::ordered_json tools; - bool add_generation_prompt = true; - nlohmann::ordered_json extra_context; - std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); -}; - -struct chat_template_options { - bool apply_polyfills = true; - bool use_bos_token = true; - bool use_eos_token = true; - bool define_strftime_now = true; - - bool polyfill_tools = true; - bool polyfill_tool_call_examples = true; - bool polyfill_tool_calls = true; - bool polyfill_tool_responses = true; - bool polyfill_system_role = true; - bool polyfill_object_arguments = true; - bool polyfill_typed_content = true; -}; - -class chat_template { - - private: - chat_template_caps caps_; - std::string source_; - std::string bos_token_; - std::string eos_token_; - std::shared_ptr template_root_; - std::string tool_call_example_; - - std::string try_raw_render( - const nlohmann::ordered_json & messages, - const nlohmann::ordered_json & tools, - bool add_generation_prompt, - const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const - { - try { - chat_template_inputs inputs; - inputs.messages = messages; - inputs.tools = tools; - inputs.add_generation_prompt = add_generation_prompt; - inputs.extra_context = extra_context; - // Use fixed date for tests - inputs.now = std::chrono::system_clock::from_time_t(0); - - chat_template_options opts; - opts.apply_polyfills = false; - - auto prompt = apply(inputs, opts); - // fprintf(stderr, "try_raw_render: %s\n", prompt.c_str()); - return prompt; - } catch (const std::exception & e) { - // fprintf(stderr, "try_raw_render error: %s\n", e.what()); - return ""; - } - } - - public: - - chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token) - : source_(source), bos_token_(bos_token), eos_token_(eos_token) - { - template_root_ = minja::Parser::parse(source_, { - /* .trim_blocks = */ true, - /* .lstrip_blocks = */ true, - /* .keep_trailing_newline = */ false, - }); - - auto contains = [](const std::string & haystack, const std::string & needle) { - return haystack.find(needle) != std::string::npos; - }; - - const std::string user_needle = ""; - const std::string sys_needle = ""; - const json dummy_str_user_msg = {{"role", 
"user"}, {"content", user_needle}}; - const json dummy_typed_user_msg = {{"role", "user"}, {"content", json::array({{{"type", "text"}, {"text", user_needle}}})}}; - - caps_.requires_typed_content = - !contains(try_raw_render(json::array({dummy_str_user_msg}), {}, false), user_needle) - && contains(try_raw_render(json::array({dummy_typed_user_msg}), {}, false), user_needle); - - const auto dummy_user_msg = caps_.requires_typed_content - ? dummy_typed_user_msg - : dummy_str_user_msg; - const json needle_system_msg = { - {"role", "system"}, - {"content", caps_.requires_typed_content ? json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)}, - }; - - caps_.supports_system_role = contains(try_raw_render({needle_system_msg, dummy_user_msg,}, {}, false), sys_needle); - - auto out = try_raw_render(json::array({ - dummy_user_msg - }), json::array({ - { - {"name", "some_tool"}, - {"type", "function"}, - {"function", { - {"name", "some_tool"}, - {"description", "Some tool."}, - {"parameters", { - {"type", "object"}, - {"properties", { - {"arg", { - {"type", "string"}, - {"description", "Some argument."}, - }}, - }}, - {"required", json::array({ "arg" })}, - }}, - }}, - }, - }), false); - caps_.supports_tools = contains(out, "some_tool"); - - auto make_tool_calls_msg = [&](const json & tool_calls) { - return json { - {"role", "assistant"}, - {"content", nullptr}, - {"tool_calls", tool_calls}, - }; - }; - auto make_tool_call = [](const std::string & tool_name, const json & arguments) { - return json { - {"id", "call_1___"}, - {"type", "function"}, - {"function", { - {"arguments", arguments}, - {"name", tool_name}, - }}, - }; - }; - const json dummy_args_obj {{"argument_needle", "print('Hello, World!')"}}; - - // Note: the arguments are rendered in both cases, but may be double-escaped, which we don't want. - out = try_raw_render(json::array({ - dummy_user_msg, - make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})), - }), {}, false); - auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':"); - out = try_raw_render(json::array({ - dummy_user_msg, - make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})), - }), {}, false); - auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':"); - - caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments; - caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments; - auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false); - auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false); - caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle); - - if (caps_.supports_tool_calls) { - auto dummy_args = caps_.requires_object_arguments ? 
dummy_args_obj : json(dummy_args_obj.dump()); - auto tc1 = make_tool_call("test_tool1", dummy_args); - auto tc2 = make_tool_call("test_tool2", dummy_args); - auto out = try_raw_render(json::array({ - dummy_user_msg, - make_tool_calls_msg(json::array({tc1, tc2})), - }), {}, false); - caps_.supports_parallel_tool_calls = contains(out, "test_tool1") && contains(out, "test_tool2"); - - out = try_raw_render(json::array({ - dummy_user_msg, - make_tool_calls_msg(json::array({tc1})), - { - {"role", "tool"}, - {"name", "test_tool1"}, - {"content", "Some response!"}, - {"tool_call_id", "call_911_"}, - } - }), {}, false); - caps_.supports_tool_responses = contains(out, "Some response!"); - caps_.supports_tool_call_id = contains(out, "call_911_"); - } - - try { - if (!caps_.supports_tools) { - const json user_msg { - {"role", "user"}, - {"content", "Hey"}, - }; - const json args { - {"arg1", "some_value"}, - }; - const json tool_call_msg { - {"role", "assistant"}, - {"content", nullptr}, - {"tool_calls", json::array({ - { - // TODO: detect if requires numerical id or fixed length == 6 like Nemo - {"id", "call_1___"}, - {"type", "function"}, - {"function", { - {"name", "tool_name"}, - {"arguments", (caps_.requires_object_arguments ? args : json(minja::Value(args).dump(-1, /* to_json= */ true)))}, - }}, - }, - })}, - }; - std::string prefix, full; - { - chat_template_inputs inputs; - inputs.messages = json::array({user_msg}); - inputs.add_generation_prompt = true; - prefix = apply(inputs); - } - { - chat_template_inputs inputs; - inputs.messages = json::array({user_msg, tool_call_msg}); - inputs.add_generation_prompt = false; - full = apply(inputs); - } - auto eos_pos_last = full.rfind(eos_token_); - if (eos_pos_last == prefix.size() - eos_token_.size() || - (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) { - full = full.substr(0, eos_pos_last); - } - size_t common_prefix_length = 0; - for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) { - if (prefix[i] != full[i]) { - break; - } - if (prefix[i] == '<') { - // DeepSeek R1's template (as of 20250209) adds a trailing if add_generation_prompt, - // but it removes thinking tags for past messages. - // The prefix and full strings diverge at vs. <|tool▁calls▁begin|>, we avoid consuming the leading <. 
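                    // Illustrative sketch (hypothetical renders, not taken from any specific template):
                    //   prefix = "...<|assistant|>"
                    //   full   = "...<|assistant|><tool_call>{\"name\": \"tool_name\", \"arguments\": {\"arg1\": \"some_value\"}}</tool_call>"
                    // This loop keeps the longest shared prefix of the two renders, so tool_call_example_
                    // would end up holding the trailing "<tool_call>...</tool_call>" portion that
                    // demonstrates the template's tool-call syntax; the <tool_call> wrapper above is an
                    // assumed, Hermes-style example and varies per template.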
- continue; - } - common_prefix_length = i + 1; - } - auto example = full.substr(common_prefix_length); - if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) { - fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n"); - } else { - tool_call_example_ = example; - } - } - } catch (const std::exception & e) { - fprintf(stderr, "Failed to generate tool call example: %s\n", e.what()); - } - } - - const std::string & source() const { return source_; } - const std::string & bos_token() const { return bos_token_; } - const std::string & eos_token() const { return eos_token_; } - const chat_template_caps & original_caps() const { return caps_; } - - // Deprecated, please use the form with chat_template_inputs and chat_template_options - std::string apply( - const nlohmann::ordered_json & messages, - const nlohmann::ordered_json & tools, - bool add_generation_prompt, - const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(), - bool apply_polyfills = true) - { - fprintf(stderr, "[%s] Deprecated!\n", __func__); - chat_template_inputs inputs; - inputs.messages = messages; - inputs.tools = tools; - inputs.add_generation_prompt = add_generation_prompt; - inputs.extra_context = extra_context; - inputs.now = std::chrono::system_clock::now(); - - chat_template_options opts; - opts.apply_polyfills = apply_polyfills; - - return apply(inputs, opts); - } - - std::string apply( - const chat_template_inputs & inputs, - const chat_template_options & opts = chat_template_options()) const - { - json actual_messages; - - auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); - auto has_tool_calls = false; - auto has_tool_responses = false; - auto has_string_content = false; - for (const auto & message : inputs.messages) { - if (message.contains("tool_calls") && !message["tool_calls"].is_null()) { - has_tool_calls = true; - } - if (message.contains("role") && message["role"] == "tool") { - has_tool_responses = true; - } - if (message.contains("content") && message["content"].is_string()) { - has_string_content = true; - } - } - - auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role; - auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools; - auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples; - auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls; - auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses; - auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments; - auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content; - - auto needs_polyfills = opts.apply_polyfills && (false - || polyfill_system_role - || polyfill_tools - || polyfill_tool_calls - || polyfill_tool_responses - || polyfill_object_arguments - || polyfill_typed_content - ); - - if (needs_polyfills) { - actual_messages = json::array(); - - auto add_message = [&](const json & msg) { - if (polyfill_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) { - actual_messages.push_back({ - {"role", msg.at("role")}, - {"content", {{ - {"type", "text"}, - {"text", msg.at("content")}, - }}}, - }); - } else { - actual_messages.push_back(msg); - } - }; - - std::string pending_system; - auto flush_sys = 
[&]() { - if (!pending_system.empty()) { - add_message({ - {"role", "user"}, - {"content", pending_system}, - }); - pending_system.clear(); - } - }; - - json adjusted_messages; - if (polyfill_tools) { - adjusted_messages = add_system(inputs.messages, - "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) + - (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n")); - } else { - adjusted_messages = inputs.messages; - } - - for (const auto & message_ : adjusted_messages) { - auto message = message_; - if (!message.contains("role") || !message.contains("content")) { - throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump()); - } - std::string role = message.at("role"); - - if (message.contains("tool_calls")) { - if (polyfill_object_arguments || polyfill_tool_calls) { - for (auto & tool_call : message.at("tool_calls")) { - if (tool_call["type"] == "function") { - auto & function = tool_call.at("function"); - auto & arguments = function.at("arguments"); - if (arguments.is_string()) { - try { - arguments = json::parse(arguments.get()); - } catch (const std::exception & ecvt) { - fprintf(stderr, "Failed to parse arguments: %s\n", ecvt.what()); - } - } - } - } - } - if (polyfill_tool_calls) { - auto content = message.at("content"); - auto tool_calls = json::array(); - for (const auto & tool_call : message.at("tool_calls")) { - if (tool_call.at("type") != "function") { - continue; - } - const auto & function = tool_call.at("function"); - auto tc = json { - {"name", function.at("name")}, - {"arguments", function.at("arguments")}, - }; - if (tool_call.contains("id")) { - tc["id"] = tool_call["id"]; - } - tool_calls.push_back(tc); - } - auto obj = json { - {"tool_calls", tool_calls}, - }; - if (!content.is_null() && content != "") { - obj["content"] = content; - } - message["content"] = obj.dump(2); - message.erase("tool_calls"); - } - } - if (polyfill_tool_responses && role == "tool") { - message["role"] = "user"; - auto obj = json { - {"tool_response", { - {"content", message.at("content")}, - }}, - }; - if (message.contains("name")) { - obj["tool_response"]["name"] = message.at("name"); - } - if (message.contains("tool_call_id")) { - obj["tool_response"]["tool_call_id"] = message.at("tool_call_id"); - } - message["content"] = obj.dump(2); - message.erase("name"); - } - - if (!message["content"].is_null() && polyfill_system_role) { - std::string content = message.at("content"); - if (role == "system") { - if (!pending_system.empty()) pending_system += "\n"; - pending_system += content; - continue; - } else { - if (role == "user") { - if (!pending_system.empty()) { - message["content"] = pending_system + (content.empty() ? "" : "\n" + content); - pending_system.clear(); - } - } else { - flush_sys(); - } - } - } - add_message(message); - } - flush_sys(); - } else { - actual_messages = inputs.messages; - } - - auto context = minja::Context::make(json({ - {"messages", actual_messages}, - {"add_generation_prompt", inputs.add_generation_prompt}, - })); - context->set("bos_token", opts.use_bos_token ? bos_token_ : ""); - context->set("eos_token", opts.use_eos_token ? 
eos_token_ : ""); - if (opts.define_strftime_now) { - auto now = inputs.now; - context->set("strftime_now", Value::callable([now](const std::shared_ptr &, minja::ArgumentsValue & args) { - args.expectArgs("strftime_now", {1, 1}, {0, 0}); - auto format = args.args[0].get(); - - auto time = std::chrono::system_clock::to_time_t(now); - auto local_time = *std::localtime(&time); - std::ostringstream ss; - ss << std::put_time(&local_time, format.c_str()); - return ss.str(); - })); - } - if (!inputs.tools.is_null()) { - context->set("tools", minja::Value(inputs.tools)); - } - if (!inputs.extra_context.is_null()) { - for (auto & kv : inputs.extra_context.items()) { - context->set(kv.key(), minja::Value(kv.value())); - } - } - - auto ret = template_root_->render(context); - // fprintf(stderr, "actual_messages: %s\n", actual_messages.dump(2).c_str()); - // fprintf(stderr, "apply: %s\n\n", ret.c_str()); - return ret; - } - - static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) { - json messages_with_system = messages; - - if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") { - std::string existing_system = messages_with_system.at(0).at("content"); - messages_with_system[0] = json { - {"role", "system"}, - {"content", existing_system + "\n\n" + system_prompt}, - }; - } else { - messages_with_system.insert(messages_with_system.begin(), json { - {"role", "system"}, - {"content", system_prompt}, - }); - } - return messages_with_system; - } -}; - -} // namespace minja diff --git a/common/chat.cpp b/common/chat.cpp deleted file mode 100644 index ef1c6fb3d..000000000 --- a/common/chat.cpp +++ /dev/null @@ -1,966 +0,0 @@ -#include "chat.hpp" -#include "chat-template.hpp" -#include "json-schema-to-grammar.h" -#include "log.h" -#include "minja.hpp" - -std::string common_chat_format_name(common_chat_format format) { - switch (format) { - case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only"; - case COMMON_CHAT_FORMAT_GENERIC: return "Generic"; - case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo"; - case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x"; - case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools"; - case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1"; - case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2"; - case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2"; - case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1"; - case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro"; - case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B"; - default: - throw std::runtime_error("Unknown chat format"); - } -} - -const common_grammar_options grammar_options { - /* .dotall = */ false, - /* .compact_spaces = */ false, - // /* .compact_spaces = */ true, -}; - -static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) { - // // https://json.nlohmann.me/features/parsing/sax_interface/ - struct json_error_locator : public nlohmann::json_sax { - std::size_t position; - bool found_error; - - json_error_locator() : position(0), found_error(false) {} - - bool parse_error(std::size_t position, const std::string &, const json::exception &) override { - this->position = position - 1; - this->found_error = true; - return false; - } - bool null() override { return true; } - bool boolean(bool) override { return true; } - bool 
number_integer(number_integer_t) override { return true; } - bool number_unsigned(number_unsigned_t) override { return true; } - bool number_float(number_float_t, const string_t &) override { return true; } - bool string(string_t &) override { return true; } - bool binary(binary_t &) override { return true; } - bool start_object(std::size_t) override { return true; } - bool key(string_t &) override { return true; } - bool end_object() override { return true; } - bool start_array(std::size_t) override { return true; } - bool end_array() override { return true; } - }; - json_error_locator err_loc; - json::sax_parse(it, end, &err_loc); - - std::string::const_iterator temptative_end; - if (err_loc.found_error) { - temptative_end = it + err_loc.position; - } else { - temptative_end = end; - } - std::string json_sub {it, temptative_end}; - try { - out = json::parse(json_sub); - it = temptative_end; - return true; - } catch (const std::exception &) { - return false; - } -} - - -/** - * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between. - * Aggregates the prefix, suffix and in-between text into the content. - */ -static common_chat_msg parse_json_tool_calls( - const std::string& input, - const std::optional & trigger_opt, - const std::regex & function_regex, - const std::regex & close_regex) { - std::smatch match; - - common_chat_msg result; - result.role = "assistant"; - - - auto end = input.end(); - auto it = input.begin(); - - if (trigger_opt) { - if (!std::regex_search(it, end, match, *trigger_opt)) { - result.content = input; - return result; - } - result.content = match.prefix().str(); - it = match.suffix().first; - } - - while (it != end) { - std::sregex_iterator rend; - std::sregex_iterator rit(it, end, function_regex); - if (rit == rend) { - fprintf(stderr, "No more tool calls found\n"); - result.content += std::string(it, end); - break; - } - auto name = rit->str(1); - result.content += std::string(it, rit->prefix().second); - it = rit->suffix().first; - - json arguments; - if (!parse_json(it, end, arguments)) { - throw std::runtime_error("Failed to parse json tool call arguments"); - } - if (!std::regex_search(it, end, match, close_regex)) { - throw std::runtime_error("Malformed input, missing closing pattern"); - } - it = match.suffix().first; - result.tool_calls.push_back({name, arguments.is_string() ? arguments.get() : arguments.dump(), /* id= */ ""}); - } - return result; -} - -static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) { - auto content_end = input.find(prefix); - size_t tc_start = std::string::npos; - - common_chat_msg result; - result.role = "assistant"; - const auto process_tool_calls = [&](const json & tool_calls) { - for (const auto & tool_call : tool_calls) { - const auto & arguments = tool_call["arguments"]; - result.tool_calls.push_back({ - tool_call["name"], - arguments.is_string() ? arguments.get() : arguments.dump(), - tool_call.contains("id") ? 
tool_call["id"] : "", - }); - } - }; - if (content_end == std::string::npos) { - result.content = input; - } else { - tc_start = content_end + prefix.size() - rstrip_prefix; - result.content = input.substr(0, content_end); - auto tool_calls = json::parse(input.substr(tc_start)); - process_tool_calls(tool_calls); - } - return result; -} - -static void foreach_function(const json & tools, const std::function & fn) { - for (const auto & tool : tools) { - if (!tool.contains("type") || tool["type"] != "function" || !tool.contains("function")) { - LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str()); - continue; - } - fn(tool); - } -} - -static std::string apply( - const common_chat_template & tmpl, - const nlohmann::ordered_json & messages, - const nlohmann::ordered_json & tools, - bool add_generation_prompt, - const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) -{ - minja::chat_template_inputs tmpl_inputs; - tmpl_inputs.messages = messages; - tmpl_inputs.tools = tools; - tmpl_inputs.add_generation_prompt = add_generation_prompt; - tmpl_inputs.extra_context = extra_context; - // TODO: add flag to control date/time, if only for testing purposes. - // tmpl_inputs.now = std::chrono::system_clock::now(); - - minja::chat_template_options tmpl_opts; - tmpl_opts.use_bos_token = false; - tmpl_opts.use_eos_token = false; - - return tmpl.apply(tmpl_inputs, tmpl_opts); -} - -static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - common_chat_params data; - - auto tool_call_schemas = json::array(); - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - auto tool_schema = json { - {"type", "object"}, - {"properties", { - {"name", { - {"type", "string"}, - {"const", function["name"]}, - }}, - {"arguments", function["parameters"]}, - }}, - {"required", json::array({"name", "arguments"})}, - }; - if (function.contains("description")) { - tool_schema["description"] = function["description"]; - } - if (inputs.parallel_tool_calls) { - tool_schema["properties"]["id"] = { - {"type", "string"}, - {"minLength", 4}, - }; - tool_schema["required"].push_back("id"); - } - tool_call_schemas.emplace_back(tool_schema); - }); - const auto tool_call = - inputs.parallel_tool_calls - ? json { - {"type", "object"}, - {"properties", { - {"tool_calls", { - {"type", "array"}, - {"items", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json { - {"anyOf", tool_call_schemas}, - }}, - {"minItems", 1}, - }}, - }}, - {"required", json::array({"tool_calls"})}, - } - : json { - {"type", "object"}, - {"properties", { - {"tool_call", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json { - {"anyOf", tool_call_schemas}, - }}, - }}, - {"required", json::array({"tool_call"})}, - }; - const auto schema = - inputs.tool_choice != "required" - ? json { - {"anyOf", json::array({ - tool_call, - { - {"type", "object"}, - {"properties", { - {"response", inputs.json_schema.is_null() - ? 
json {{"type", "string"}} - : inputs.json_schema - }, - }}, - {"required", json::array({"response"})}, - }, - })} - } - : tool_call; - - data.grammar_lazy = false; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - builder.add_schema("root", schema); - }, grammar_options); - - auto tweaked_messages = common_chat_template::add_system( - inputs.messages, - "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request"); - - data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); - data.format = COMMON_CHAT_FORMAT_GENERIC; - return data; -} -static common_chat_msg common_chat_parse_generic(const std::string & input) { - json data = json::parse(input); - common_chat_msg result; - result.role = "assistant"; - if (data.contains("tool_calls")) { - for (const auto & tool_call : data["tool_calls"]) { - result.tool_calls.push_back({ - tool_call["name"], - tool_call["arguments"].dump(), - tool_call.contains("id") ? tool_call["id"] : "", - }); - } - } else if (data.contains("tool_call")) { - result.tool_calls.push_back({ - data["tool_call"]["name"], - data["tool_call"]["arguments"].dump(), - /* id= */ "", - }); - } else if (data.contains("response")) { - const auto & response = data["response"]; - result.content = response.is_string() ? response.get() : response.dump(2); - } - return result; -} - -static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - common_chat_params data; - data.grammar_lazy = inputs.tool_choice != "required"; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - auto schemas = json::array(); - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - schemas.push_back({ - {"type", "object"}, - {"properties", { - // Important note: the model is probably trained to take a JSON stringified arguments value. - // It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object. - {"name", { - {"type", "string"}, - {"const", function["name"]}, - }}, - {"arguments", function["parameters"]}, - {"id", { - {"type", "string"}, - // Nemo's template expects a 9-character alphanumeric ID. - {"pattern", "^[a-zA-Z0-9]{9}$"}, - }}, - }}, - {"required", json::array({"name", "arguments", "id"})}, - }); - }); - auto schema = json { - {"type", "array"}, - {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}}, - {"minItems", 1}, - }; - if (!inputs.parallel_tool_calls) { - schema["maxItems"] = 1; - } - builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema)); - }, grammar_options); - data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true}); - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? 
json() : inputs.tools, inputs.add_generation_prompt); - data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO; - return data; -} -static common_chat_msg common_chat_parse_mistral_nemo(const std::string & input) { - return parse_prefixed_json_tool_call_array(input, "[TOOL_CALLS]"); -} - -static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - common_chat_params data; - data.grammar_lazy = inputs.tool_choice != "required"; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - auto schemas = json::array(); - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - schemas.push_back({ - {"type", "object"}, - {"properties", { - {"tool_call_id", { - {"type", "string"}, - // Command-R's template expects an integer string. - {"pattern", "^[0-9]{1,10}$"}, - }}, - {"tool_name", { - {"type", "string"}, - {"const", function["name"]}, - }}, - {"parameters", function["parameters"]}, - }}, - {"required", json::array({"tool_call_id", "tool_name", "parameters"})}, - }); - }); - auto schema = json { - {"type", "array"}, - {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}}, - {"minItems", 1}, - }; - if (!inputs.parallel_tool_calls) { - schema["maxItems"] = 1; - } - builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\""); - }, grammar_options); - data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false}); - data.preserved_tokens = { - "<|START_RESPONSE|>", - "<|END_RESPONSE|>", - "<|START_THINKING|>", - "<|END_THINKING|>", - "<|END_ACTION|>", - }; - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); - data.format = COMMON_CHAT_FORMAT_COMMAND_R7B; - return data; -} -static common_chat_msg common_chat_parse_command_r7b(const std::string & input) { - static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>"); - static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>"); - std::smatch match; - - common_chat_msg result; - result.role = "assistant"; - if (std::regex_match(input, match, response_regex)) { - result.content = match[1].str(); - } else if (std::regex_match(input, match, thought_action_regex)) { - result.tool_plan = match[1].str(); - auto actions_str = match[2].str(); - auto actions = json::parse(actions_str); - for (const auto & action : actions) { - result.tool_calls.push_back({ - /* .name = */ action["tool_name"], - /* .arguments = */ action["parameters"].dump(), - /* .id = */ action["tool_call_id"], - }); - } - } else { - LOG_ERR("Failed to parse command_r output"); - result.content = input; - } - return result; -} - -static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector & expected_properties) { - if (!parameters.is_object() || !parameters.contains("type") || parameters["type"] != "object" || !parameters.contains("properties") || !parameters.contains("required")) { - throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties"); - } - const auto & parameters_properties = parameters.at("properties"); - const auto & parameters_required = parameters.at("required"); - for (const auto & prop : expected_properties) { - if (!parameters_properties.contains(prop)) { - 
throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop); - } - if (std::find(parameters_required.begin(), parameters_required.end(), json(prop)) == parameters_required.end()) { - throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop); - } - } - if (parameters_properties.size() != expected_properties.size()) { - throw std::runtime_error("Parameters of tool " + name + " must only have these properties:" + string_join(expected_properties, ", ")); - } -} - -static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct common_chat_inputs & inputs, bool allow_python_tag_builtin_tools) { - auto builtin_tools = json::array(); - common_chat_params data; - data.grammar_lazy = inputs.tool_choice != "required"; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector tool_rules; - - auto handle_builtin_tool = [&](const std::string & name, const json & parameters) { - if (name == "wolfram_alpha") { - // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py - expect_tool_parameters(name, parameters, {"query"}); - } else if (name == "web_search" || name == "brave_search") { - // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py - expect_tool_parameters(name, parameters, {"query"}); - } else if (name == "python" || name == "code_interpreter") { - // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py - expect_tool_parameters(name, parameters, {"code"}); - } else { - return false; - } - - std::vector kvs; - for (const auto & [key, value] : parameters.at("properties").items()) { - kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); - } - - tool_rules.push_back( - builder.add_rule( - name + "-call", - "\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\"")); - builtin_tools.push_back(name); - - return true; - }; - - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - std::string name = function["name"]; - auto parameters = function["parameters"]; - builder.resolve_refs(parameters); - - // https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime - if (allow_python_tag_builtin_tools) { - handle_builtin_tool(name, parameters); - } - tool_rules.push_back( - builder.add_rule( - name + "-call", - "\"{\" space " - "( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? 
" - "\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " + - builder.add_schema(name + "-args", parameters) + - " \"}\"")); - data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true}); - }); - data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true}); - if (!builtin_tools.empty()) { - data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false}); - } - builder.add_rule("root", string_join(tool_rules, " | ")); - }, grammar_options); - data.additional_stops.push_back("<|eom_id|>"); - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, { - {"tools_in_user_message", false}, - {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools}, - }); - data.format = allow_python_tag_builtin_tools && !builtin_tools.empty() - ? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS - : COMMON_CHAT_FORMAT_LLAMA_3_X; - return data; -} -static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) { - // TODO: tighten & simplify the parser, don't accept leading text context. - static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": "); - static std::regex close_regex("\\}"); - static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)"); - - if (with_builtin_tools) { - std::smatch match; - if (std::regex_match(input, match, builtin_call_regex)) { - auto name = match[1].str(); - auto raw_args = match[2].str(); - - // TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing. 
- auto it_eq = raw_args.find('='); - auto arg_name = raw_args.substr(0, it_eq); - auto arg_value_str = raw_args.substr(it_eq + 1); - auto arg_value = json::parse(arg_value_str); - - return { - /* .role = */ "assistant", - /* .content = */ match.prefix().str(), - /* .tool_calls = */ { - { - /* .name = */ match[1], - /* .arguments = */ (json { - {arg_name, arg_value}, - }).dump(), - /* .id = */ "", - }, - }, - }; - } - } - return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); -} - -static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - common_chat_params data; - data.grammar_lazy = inputs.tool_choice != "required"; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector tool_rules; - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - std::string name = function["name"]; - auto parameters = function["parameters"]; - auto args_rule = builder.add_schema(name + "-args", parameters); - tool_rules.push_back(builder.add_rule(name + "-call", - "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\"")); - }); - data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false}); - data.preserved_tokens = { - "<|tool▁sep|>", - "<|tool▁call▁end|>", - }; - builder.add_rule("root", "\"<|tool▁calls▁begin|>\" (" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " space"); - }, grammar_options); - auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); - data.prompt = prompt; - data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1; - return data; -} -static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input) { - static std::regex trigger_regex("<|tool▁calls▁begin|>"); - static std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n"); - static std::regex close_regex("```<|tool▁call▁end|>"); - return parse_json_tool_calls(input, trigger_regex, function_regex, close_regex); -} - -static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - fprintf(stderr, "%s\n", __func__); - common_chat_params data; - data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, { - {"datetime", "Jan 29 2025 13:00:00 GMT"}, - {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))}, - }); - if (!inputs.tools.is_null() && !inputs.tools.empty()) { - data.grammar_lazy = inputs.tool_choice != "required"; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - auto schemas = json::array(); - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - schemas.push_back({ - {"type", "object"}, - {"properties", { - {"name", { - {"type", "string"}, - {"const", function["name"]}, - }}, - {"arguments", function["parameters"]}, - }}, - {"required", json::array({"name", "arguments", "id"})}, - }); - }); - auto schema = json { - {"type", "array"}, - {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}}, - {"minItems", 1}, - }; - if (!inputs.parallel_tool_calls) { - schema["maxItems"] = 1; - } - builder.add_rule("root", "\" functools\"? 
" + builder.add_schema("tool_calls", schema)); - }, grammar_options); - data.grammar_triggers.push_back({" functools[", /* .at_start = */ false}); - data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2; - } else { - data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - } - return data; -} -static common_chat_msg common_chat_parse_firefunction_v2(const std::string & input) { - return parse_prefixed_json_tool_call_array(input, " functools[", /* rstrip_prefix= */ 1); -} - -static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - // >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}... - // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar - common_chat_params data; - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); - data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2; - if (!inputs.tools.is_null() && !inputs.tools.empty()) { - data.grammar_lazy = inputs.tool_choice != "required"; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector first_tool_rules; - std::vector subsequent_tool_rules; - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - std::string name = function["name"]; - auto parameters = function["parameters"]; - auto args_rule = builder.add_schema(name + "-args", parameters); - first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule)); - subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule)); - data.grammar_triggers.push_back({name, /* .at_start = */ true}); - data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false}); - }); - auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space"; - if (inputs.parallel_tool_calls) { - auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space"; - builder.add_rule("root", first_rule + " (" + subsequent_rule + ")*"); - } else { - builder.add_rule("root", first_rule); - } - - }, grammar_options); - } - return data; -} - -static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) { - auto expected_it = expected.begin(); - auto tmp_it = it; - while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) { - ++tmp_it; - ++expected_it; - } - if (expected_it == expected.end()) { - it = tmp_it; - return true; - } - return false; -} - -static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) { - static std::regex function_regex(R"((?:>>>)?(\w+)\n)"); - static std::regex close_regex(R"($|(?=>>>))"); - - std::string content; - auto it = input.begin(); - const auto end = input.end(); - - if (consume(it, end, "all\n")) { - std::smatch match; - if (std::regex_search(it, end, match, function_regex)) { - auto fun_it = match.prefix().second; - content = std::string(it, fun_it); - it = fun_it; - } else { - common_chat_msg res; - res.role = "assistant"; - res.content = std::string(it, end); - return res; - } - } - // TODO: tighten & simplify. 
- try { - auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex); - res.content = content + res.content; - return res; - } catch (const std::exception & e) { - LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what()); - common_chat_msg res; - res.role = "assistant"; - res.content = input; - return res; - } -} - -static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - // https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt - common_chat_params data; - json tools = inputs.tools.is_null() ? inputs.tools : json::array(); - std::string python_code_argument_name; - auto has_raw_python = false; - - data.grammar_lazy = inputs.tool_choice != "required"; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector tool_rules; - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - const auto & parameters = function["parameters"]; - std::string name = function["name"]; - if (name == "python" || name == "ipython") { - if (!parameters.contains("type")) { - throw std::runtime_error("Missing type in python tool"); - } - has_raw_python = true; - auto type = parameters.at("type"); - if (type == "object") { - auto properties = parameters.at("properties"); - for (auto it = properties.begin(); it != properties.end(); ++it) { - if (it.value().at("type") == "string") { - if (!python_code_argument_name.empty()) { - throw std::runtime_error("Multiple string arguments found in python tool"); - } - python_code_argument_name = it.key(); - } - } - if (python_code_argument_name.empty()) { - throw std::runtime_error("No string argument found in python tool"); - } - } else if (type != "string") { - throw std::runtime_error("Invalid type in python tool: " + type.dump()); - } - } - tool_rules.push_back(builder.add_rule(name + "-call", "\"\" " + builder.add_schema(name + "-args", parameters) + " \"\" space")); - }); - if (has_raw_python) { - tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*")); - data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false}); - } - auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space"; - builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call); - data.grammar_triggers.push_back({"([\s\S\n]*)$)"); - std::smatch match; - if (std::regex_search(input, match, python_tag_regex)) { - auto code = match[1].str(); - return { - /* .role = */ "assistant", - /* .content = */ match.prefix().str(), - /* .tool_calls = */ { - { - /* .name = */ "python", - /* .arguments = */ (json {{"code", code}}).dump(), - /* .id = */ "", - }, - } - }; - } - static std::regex function_regex(R"()"); - static std::regex close_regex(R"()"); - // TODO: tighten & simplify. 
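    // Illustrative sketch (assuming the Functionary v3.1 "<function=NAME>{json args}</function>" call syntax
    // that the two regexes above are meant to match):
    //   "<function=get_weather>{\"city\": \"Paris\"}</function>"
    // would parse into a single get_weather tool call with arguments {"city": "Paris"};
    // get_weather and "city" are hypothetical names used only for illustration.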
- return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); -} - -static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - common_chat_params data; - // (content)?({"name": "foo", "arguments": {"a": 1}})* - data.grammar_lazy = inputs.tool_choice != "required"; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector tool_rules; - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - std::string name = function["name"]; - auto parameters = function["parameters"]; - builder.resolve_refs(parameters); - tool_rules.push_back(builder.add_schema(name + "-call", { - {"type", "object"}, - {"properties", json { - {"name", json {{"const", name}}}, - {"arguments", parameters}, - }}, - {"required", json::array({"name", "arguments"})}, - })); - }); - auto tool_call = "\"\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"\" space"; - builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call); - data.grammar_triggers.push_back({"", /* .at_start = */ false}); - data.preserved_tokens = { "" }; - }, grammar_options); - - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); - data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO; - return data; -} -static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) { - try { - std::regex start_pattern(R"([\n\s]*)"); - std::regex middle_pattern(R"([\n\s]*[\n\s]*)"); - std::regex end_pattern(R"([\n\s]*[\n\s]*$)"); - - auto end = input.end(); - std::sregex_iterator rend; - std::sregex_iterator rit(input.begin(), end, start_pattern); - if (rit == rend) { - return { - /* .role = */ "assistant", - /* .content = */ input, - /* .tool_calls = */ {}, - }; - } - - common_chat_msg result; - result.role = "assistant"; - result.content = rit->prefix(); - - auto it = rit->suffix().first; - while (it != end) { - json call; - if (!parse_json(it, end, call)) { - throw std::runtime_error("Failed to parse json tool call"); - } - const auto & arguments = call["arguments"]; - result.tool_calls.push_back({ - call["name"], - arguments.dump(), - // arguments.is_string() ? arguments.get() : arguments.dump(), - /* id= */ "", - }); - rit = {it, end, middle_pattern}; - if (rit != rend) { - it = rit->suffix().first; - } else { - rit = {it, end, end_pattern}; - if (rit == rend) { - throw std::runtime_error("Malformed input, missing "); - } - break; - } - } - return result; - } catch (const std::exception & e) { - return { - /* .role = */ "assistant", - /* .content = */ input, - /* .tool_calls = */ {}, - }; - } -} - -static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - common_chat_params data; - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? 
json() : inputs.tools, inputs.add_generation_prompt); - data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - data.grammar_lazy = false; - if (!inputs.json_schema.is_null()) { - if (!inputs.grammar.empty()) { - throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both"); - } - data.grammar = json_schema_to_grammar(inputs.json_schema); - } else { - data.grammar = inputs.grammar.empty(); - } - return data; -} - -common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - auto has_tools = !inputs.tools.is_null() && inputs.tool_choice != "none"; - LOG_DBG("[%s] has_tools=%s\n", __func__, has_tools ? "true" : "false"); - - if (has_tools && !inputs.grammar.empty()) { - throw std::runtime_error("Cannot specify grammar with tools"); - } - - const auto & src = tmpl.source(); - if (src.find(">>>all") != std::string::npos) { - // Functionary prepends "all\n" to plain content outputs, so we use the parser no matter when - return common_chat_params_init_functionary_v3_2(tmpl, inputs); - } - if (src.find(" functools[") != std::string::npos) { - // Firefunction v2 requires datetime and functions in the context, even w/o tools. - return common_chat_params_init_firefunction_v2(tmpl, inputs); - } - - if (!has_tools) { - return common_chat_params_init_without_tools(tmpl, inputs); - } - - if (src.find("") != std::string::npos) { - return common_chat_params_init_hermes_2_pro(tmpl, inputs); - } - if (src.find("<|start_header_id|>") != std::string::npos - && src.find("ipython<|end_header_id|>") != std::string::npos) { - auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos; - return common_chat_params_init_llama_3_1_tool_calls(tmpl, inputs, allow_python_tag_builtin_tools); - } - if (src.find("<|tool▁calls▁begin|>") != std::string::npos) { - return common_chat_params_init_deepseek_r1(tmpl, inputs); - } - if (src.find("[TOOL_CALLS]") != std::string::npos) { - return common_chat_params_init_mistral_nemo(tmpl, inputs); - } - if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos) { - return common_chat_params_init_command_r7b(tmpl, inputs); - } - return common_chat_params_init_generic(tmpl, inputs); -} - -static common_chat_msg common_chat_parse_content_only(const std::string & input) { - return { - /* .role = */ "assistant", - /* .content = */ input, - /* .tool_calls = */ {}, - }; -} - -common_chat_msg common_chat_parse(const std::string & input, common_chat_format format) { - switch (format) { - case COMMON_CHAT_FORMAT_CONTENT_ONLY: - return common_chat_parse_content_only(input); - case COMMON_CHAT_FORMAT_GENERIC: - return common_chat_parse_generic(input); - case COMMON_CHAT_FORMAT_MISTRAL_NEMO: - return common_chat_parse_mistral_nemo(input); - case COMMON_CHAT_FORMAT_LLAMA_3_X: - return common_chat_parse_llama_3_1(input); - case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: - return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true); - case COMMON_CHAT_FORMAT_DEEPSEEK_R1: - return common_chat_parse_deepseek_r1(input); - case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: - return common_chat_parse_functionary_v3_2(input); - case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: - return common_chat_parse_functionary_v3_1_llama_3_1(input); - case COMMON_CHAT_FORMAT_HERMES_2_PRO: - return common_chat_parse_hermes_2_pro(input); - case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: - return common_chat_parse_firefunction_v2(input); - case COMMON_CHAT_FORMAT_COMMAND_R7B: - return 
common_chat_parse_command_r7b(input); - default: - throw std::runtime_error("Unsupported format: " + common_chat_format_name(format)); - } -} diff --git a/common/chat.hpp b/common/chat.hpp deleted file mode 100644 index 33e64a430..000000000 --- a/common/chat.hpp +++ /dev/null @@ -1,52 +0,0 @@ -// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers. - -#pragma once - -#include "common.h" -#include -#include -#include -#include - -using json = nlohmann::ordered_json; - -struct common_chat_inputs { - json messages; - json tools; - json tool_choice; - json json_schema; - bool parallel_tool_calls; - bool stream; - std::string grammar; - bool add_generation_prompt = true; -}; - -enum common_chat_format { - COMMON_CHAT_FORMAT_CONTENT_ONLY, - COMMON_CHAT_FORMAT_GENERIC, - COMMON_CHAT_FORMAT_MISTRAL_NEMO, - COMMON_CHAT_FORMAT_LLAMA_3_X, - COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS, - COMMON_CHAT_FORMAT_DEEPSEEK_R1, - COMMON_CHAT_FORMAT_FIREFUNCTION_V2, - COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, - COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1, - COMMON_CHAT_FORMAT_HERMES_2_PRO, - COMMON_CHAT_FORMAT_COMMAND_R7B, - - COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats -}; - -struct common_chat_params { - common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - json prompt; - std::string grammar; - bool grammar_lazy = false; - std::vector grammar_triggers; - std::vector preserved_tokens; - std::vector additional_stops; -}; - -struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params); -std::string common_chat_format_name(common_chat_format format); -common_chat_msg common_chat_parse( const std::string & input, common_chat_format format); diff --git a/common/common.cpp b/common/common.cpp index 8661e164a..4c19132f1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2,35 +2,26 @@ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif -#include "ggml.h" -#include "gguf.h" - #include "common.h" -#include "log.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" #include "json-schema-to-grammar.h" #include "llama.h" -#include "chat.hpp" -#include "chat-template.hpp" #include #include -#include #include #include #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include @@ -57,6 +48,7 @@ #if defined(LLAMA_USE_CURL) #include #include +#include #include #endif @@ -64,33 +56,23 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) +#define GGML_USE_CUDA_SYCL +#endif + +#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) +#define GGML_USE_CUDA_SYCL_VULKAN +#endif + #if defined(LLAMA_USE_CURL) #ifdef __linux__ #include #elif defined(_WIN32) -# if !defined(PATH_MAX) -# define PATH_MAX MAX_PATH -# endif +#define PATH_MAX MAX_PATH #else #include #endif #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 - -// -// CURL utils -// - -using curl_ptr = std::unique_ptr; - -// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one -struct curl_slist_ptr { - struct curl_slist * ptr = nullptr; - ~curl_slist_ptr() { - if (ptr) { - curl_slist_free_all(ptr); - } - } -}; #endif // LLAMA_USE_CURL using json = nlohmann::ordered_json; @@ -128,34 +110,8 @@ int32_t cpu_get_num_physical_cores() 
{ if (result == 0) { return num_physical_cores; } -#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later - // TODO: windows + arm64 + mingw64 - unsigned int n_threads_win = std::thread::hardware_concurrency(); - unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4; - - DWORD buffer_size = 0; - if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) { - if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) { - return default_threads; - } - } - - std::vector buffer(buffer_size); - if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast(buffer.data()), &buffer_size)) { - return default_threads; - } - - int32_t num_physical_cores = 0; - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast(buffer.data()); - while (buffer_size > 0) { - if (info->Relationship == RelationProcessorCore) { - num_physical_cores += info->Processor.GroupCount; - } - buffer_size -= info->Size; - info = reinterpret_cast(reinterpret_cast(info) + info->Size); - } - - return num_physical_cores > 0 ? num_physical_cores : default_threads; +#elif defined(_WIN32) + //TODO: Implement #endif unsigned int n_threads = std::thread::hardware_concurrency(); return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; @@ -234,189 +190,1528 @@ int32_t cpu_get_num_math() { return cpu_get_num_physical_cores(); } -// Helper for setting process priority - -#if defined(_WIN32) - -bool set_process_priority(enum ggml_sched_priority prio) { - if (prio == GGML_SCHED_PRIO_NORMAL) { - return true; - } - - DWORD p = NORMAL_PRIORITY_CLASS; - switch (prio) { - case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; - case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; - case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; - case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; - } - - if (!SetPriorityClass(GetCurrentProcess(), p)) { - LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); - return false; - } - - return true; -} - -#else // MacOS and POSIX -#include -#include - -bool set_process_priority(enum ggml_sched_priority prio) { - if (prio == GGML_SCHED_PRIO_NORMAL) { - return true; - } - - int p = 0; - switch (prio) { - case GGML_SCHED_PRIO_NORMAL: p = 0; break; - case GGML_SCHED_PRIO_MEDIUM: p = -5; break; - case GGML_SCHED_PRIO_HIGH: p = -10; break; - case GGML_SCHED_PRIO_REALTIME: p = -20; break; - } - - if (!setpriority(PRIO_PROCESS, 0, p)) { - LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); - return false; - } - return true; -} - -#endif - // // CLI argument parsing // - -void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { - int32_t n_set = 0; - - if (cpuparams.n_threads < 0) { - // Assuming everything about cpuparams is invalid - if (role_model != nullptr) { - cpuparams = *role_model; - } else { - cpuparams.n_threads = cpu_get_num_math(); - } - } - - for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) { - if (cpuparams.cpumask[i]) { - n_set++; - } - } - - if (n_set && n_set < cpuparams.n_threads) { - // Not enough set bits, may experience performance issues. 
- LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); +void gpt_params_handle_hf_token(gpt_params & params) { + if (params.hf_token.empty() && std::getenv("HF_TOKEN")) { + params.hf_token = std::getenv("HF_TOKEN"); } } -bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { - size_t dash_loc = range.find('-'); - if (dash_loc == std::string::npos) { - LOG_ERR("Format of CPU range is invalid! Expected []-[].\n"); +void gpt_params_handle_model_default(gpt_params & params) { + if (!params.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (params.hf_file.empty()) { + if (params.model.empty()) { + throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); + } + params.hf_file = params.model; + } else if (params.model.empty()) { + params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); + } + } else if (!params.model_url.empty()) { + if (params.model.empty()) { + auto f = string_split(params.model_url, '#').front(); + f = string_split(f, '?').front(); + params.model = fs_get_cache_file(string_split(f, '/').back()); + } + } else if (params.model.empty()) { + params.model = DEFAULT_MODEL_PATH; + } +} + +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { + bool invalid_param = false; + std::string arg; + const std::string arg_prefix = "--"; + llama_sampling_params & sparams = params.sparams; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) { + throw std::invalid_argument("error: unknown argument: " + arg); + } + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument: " + arg); + } + } + + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); + } + + gpt_params_handle_model_default(params); + + gpt_params_handle_hf_token(params); + + if (params.escape) { + string_process_escapes(params.prompt); + string_process_escapes(params.input_prefix); + string_process_escapes(params.input_suffix); + string_process_escapes(sparams.cfg_negative_prompt); + for (auto & antiprompt : params.antiprompt) { + string_process_escapes(antiprompt); + } + } + + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + + return true; +} + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + const auto params_org = params; // the example can modify the default params + + try { + if (!gpt_params_parse_ex(argc, argv, params) || params.usage) { + params = params_org; + params.usage = true; + return false; + } + } catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + params = params_org; return false; } - size_t start_i; - size_t end_i; - - if (dash_loc == 0) { - start_i = 0; - } else { - start_i = std::stoull(range.substr(0, dash_loc)); - if (start_i >= GGML_MAX_N_THREADS) { - LOG_ERR("Start index out of bounds!\n"); - return false; - } - } - - if (dash_loc == range.length() - 1) { - end_i = GGML_MAX_N_THREADS - 1; - } else { - end_i = std::stoull(range.substr(dash_loc + 1)); - if (end_i >= GGML_MAX_N_THREADS) { - LOG_ERR("End index out of bounds!\n"); - return 
false; - } - } - - for (size_t i = start_i; i <= end_i; i++) { - boolmask[i] = true; - } - return true; } -bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) { - // Discard potential 0x prefix - size_t start_i = 0; - if (mask.length() >= 2 && mask.substr(0, 2) == "0x") { - start_i = 2; +#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } + +bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { + const char split_delim = ','; + + llama_sampling_params & sparams = params.sparams; + + if (arg == "-s" || arg == "--seed") { + CHECK_ARG + // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. + params.seed = std::stoul(argv[i]); + sparams.seed = std::stoul(argv[i]); + return true; } + if (arg == "-t" || arg == "--threads") { + CHECK_ARG + params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + params.n_threads = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-tb" || arg == "--threads-batch") { + CHECK_ARG + params.n_threads_batch = std::stoi(argv[i]); + if (params.n_threads_batch <= 0) { + params.n_threads_batch = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-td" || arg == "--threads-draft") { + CHECK_ARG + params.n_threads_draft = std::stoi(argv[i]); + if (params.n_threads_draft <= 0) { + params.n_threads_draft = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-tbd" || arg == "--threads-batch-draft") { + CHECK_ARG + params.n_threads_batch_draft = std::stoi(argv[i]); + if (params.n_threads_batch_draft <= 0) { + params.n_threads_batch_draft = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-p" || arg == "--prompt") { + CHECK_ARG + params.prompt = argv[i]; + return true; + } + if (arg == "-e" || arg == "--escape") { + params.escape = true; + return true; + } + if (arg == "--no-escape") { + params.escape = false; + return true; + } + if (arg == "--prompt-cache") { + CHECK_ARG + params.path_prompt_cache = argv[i]; + return true; + } + if (arg == "--prompt-cache-all") { + params.prompt_cache_all = true; + return true; + } + if (arg == "--prompt-cache-ro") { + params.prompt_cache_ro = true; + return true; + } + if (arg == "-bf" || arg == "--binary-file") { + CHECK_ARG + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); + return true; + } + if (arg == "-f" || arg == "--file") { + CHECK_ARG + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + return true; + } + if (arg == "--in-file") { + CHECK_ARG + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + 
params.in_files.push_back(argv[i]); + return true; + } + if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { + CHECK_ARG + params.n_predict = std::stoi(argv[i]); + return true; + } + if (arg == "--top-k") { + CHECK_ARG + sparams.top_k = std::stoi(argv[i]); + return true; + } + if (arg == "-c" || arg == "--ctx-size") { + CHECK_ARG + params.n_ctx = std::stoi(argv[i]); + return true; + } + if (arg == "--grp-attn-n" || arg == "-gan") { + CHECK_ARG + params.grp_attn_n = std::stoi(argv[i]); + return true; + } + if (arg == "--grp-attn-w" || arg == "-gaw") { + CHECK_ARG + params.grp_attn_w = std::stoi(argv[i]); + return true; + } + if (arg == "--rope-freq-base") { + CHECK_ARG + params.rope_freq_base = std::stof(argv[i]); + return true; + } + if (arg == "--rope-freq-scale") { + CHECK_ARG + params.rope_freq_scale = std::stof(argv[i]); + return true; + } + if (arg == "--rope-scaling") { + CHECK_ARG + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { invalid_param = true; } + return true; + } + if (arg == "--rope-scale") { + CHECK_ARG + params.rope_freq_scale = 1.0f / std::stof(argv[i]); + return true; + } + if (arg == "--yarn-orig-ctx") { + CHECK_ARG + params.yarn_orig_ctx = std::stoi(argv[i]); + return true; + } + if (arg == "--yarn-ext-factor") { + CHECK_ARG + params.yarn_ext_factor = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-attn-factor") { + CHECK_ARG + params.yarn_attn_factor = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-beta-fast") { + CHECK_ARG + params.yarn_beta_fast = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-beta-slow") { + CHECK_ARG + params.yarn_beta_slow = std::stof(argv[i]); + return true; + } + if (arg == "--pooling") { + CHECK_ARG + std::string value(argv[i]); + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else { invalid_param = true; } + return true; + } + if (arg == "--attention") { + CHECK_ARG + std::string value(argv[i]); + /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + else { invalid_param = true; } + return true; + } + if (arg == "--defrag-thold" || arg == "-dt") { + CHECK_ARG + params.defrag_thold = std::stof(argv[i]); + return true; + } + if (arg == "--samplers") { + CHECK_ARG + const auto sampler_names = string_split(argv[i], ';'); + sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); + return true; + } + if (arg == "--sampling-seq") { + CHECK_ARG + sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]); + return true; + } + if (arg == "--top-p") { + CHECK_ARG + sparams.top_p = std::stof(argv[i]); + return true; + } + if (arg == "--min-p") { + CHECK_ARG + sparams.min_p = std::stof(argv[i]); + return true; + } + if (arg == "--temp") { + CHECK_ARG + sparams.temp = std::stof(argv[i]); + sparams.temp = std::max(sparams.temp, 0.0f); + return true; + } + if (arg == "--tfs") { + CHECK_ARG + sparams.tfs_z = std::stof(argv[i]); 
+ return true; + } + if (arg == "--typical") { + CHECK_ARG + sparams.typical_p = std::stof(argv[i]); + return true; + } + if (arg == "--repeat-last-n") { + CHECK_ARG + sparams.penalty_last_n = std::stoi(argv[i]); + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); + return true; + } + if (arg == "--repeat-penalty") { + CHECK_ARG + sparams.penalty_repeat = std::stof(argv[i]); + return true; + } + if (arg == "--frequency-penalty") { + CHECK_ARG + sparams.penalty_freq = std::stof(argv[i]); + return true; + } + if (arg == "--presence-penalty") { + CHECK_ARG + sparams.penalty_present = std::stof(argv[i]); + return true; + } + if (arg == "--dynatemp-range") { + CHECK_ARG + sparams.dynatemp_range = std::stof(argv[i]); + return true; + } + if (arg == "--dynatemp-exp") { + CHECK_ARG + sparams.dynatemp_exponent = std::stof(argv[i]); + return true; + } + if (arg == "--mirostat") { + CHECK_ARG + sparams.mirostat = std::stoi(argv[i]); + return true; + } + if (arg == "--mirostat-lr") { + CHECK_ARG + sparams.mirostat_eta = std::stof(argv[i]); + return true; + } + if (arg == "--mirostat-ent") { + CHECK_ARG + sparams.mirostat_tau = std::stof(argv[i]); + return true; + } + if (arg == "--cfg-negative-prompt") { + CHECK_ARG + sparams.cfg_negative_prompt = argv[i]; + return true; + } + if (arg == "--cfg-negative-prompt-file") { + CHECK_ARG + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); + if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { + sparams.cfg_negative_prompt.pop_back(); + } + return true; + } + if (arg == "--cfg-scale") { + CHECK_ARG + sparams.cfg_scale = std::stof(argv[i]); + return true; + } + if (arg == "-b" || arg == "--batch-size") { + CHECK_ARG + params.n_batch = std::stoi(argv[i]); + return true; + } + if (arg == "-ub" || arg == "--ubatch-size") { + CHECK_ARG + params.n_ubatch = std::stoi(argv[i]); + return true; + } + if (arg == "--keep") { + CHECK_ARG + params.n_keep = std::stoi(argv[i]); + return true; + } + if (arg == "--draft") { + CHECK_ARG + params.n_draft = std::stoi(argv[i]); + return true; + } + if (arg == "--chunks") { + CHECK_ARG + params.n_chunks = std::stoi(argv[i]); + return true; + } + if (arg == "-np" || arg == "--parallel") { + CHECK_ARG + params.n_parallel = std::stoi(argv[i]); + return true; + } + if (arg == "-ns" || arg == "--sequences") { + CHECK_ARG + params.n_sequences = std::stoi(argv[i]); + return true; + } + if (arg == "--p-split" || arg == "-ps") { + CHECK_ARG + params.p_split = std::stof(argv[i]); + return true; + } + if (arg == "-m" || arg == "--model") { + CHECK_ARG + params.model = argv[i]; + return true; + } + if (arg == "-md" || arg == "--model-draft") { + CHECK_ARG + params.model_draft = argv[i]; + return true; + } + if (arg == "-a" || arg == "--alias") { + CHECK_ARG + params.model_alias = argv[i]; + return true; + } + if (arg == "-mu" || arg == "--model-url") { + CHECK_ARG + params.model_url = argv[i]; + return true; + } + if (arg == "-hft" || arg == "--hf-token") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hf_token = argv[i]; + return true; + } + if (arg == "-hfr" || arg == "--hf-repo") { + CHECK_ARG + params.hf_repo = argv[i]; + return true; + } + if (arg == "-hff" || arg == "--hf-file") { + CHECK_ARG + params.hf_file = argv[i]; + return true; + } + if 
(arg == "--lora") { + CHECK_ARG + params.lora_adapter.emplace_back(argv[i], 1.0f); + return true; + } + if (arg == "--lora-scaled") { + CHECK_ARG + const char* lora_adapter = argv[i]; + CHECK_ARG + params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + return true; + } + if (arg == "--control-vector") { + CHECK_ARG + params.control_vectors.push_back({ 1.0f, argv[i], }); + return true; + } + if (arg == "--control-vector-scaled") { + CHECK_ARG + const char* fname = argv[i]; + CHECK_ARG + params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + return true; + } + if (arg == "--control-vector-layer-range") { + CHECK_ARG + params.control_vector_layer_start = std::stoi(argv[i]); + CHECK_ARG + params.control_vector_layer_end = std::stoi(argv[i]); + return true; + } + if (arg == "--mmproj") { + CHECK_ARG + params.mmproj = argv[i]; + return true; + } + if (arg == "--image") { + CHECK_ARG + params.image.emplace_back(argv[i]); + return true; + } + if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + return true; + } + if (arg == "-sp" || arg == "--special") { + params.special = true; + return true; + } + if (arg == "--embedding" || arg == "--embeddings") { + params.embedding = true; + return true; + } + if (arg == "--embd-normalize") { + CHECK_ARG + params.embd_normalize = std::stoi(argv[i]); + return true; + } + if (arg == "--embd-output-format") { + CHECK_ARG + params.embd_out = argv[i]; + return true; + } + if (arg == "--embd-separator") { + CHECK_ARG + params.embd_sep = argv[i]; + return true; + } + if (arg == "-if" || arg == "--interactive-first") { + params.interactive_first = true; + return true; + } + if (arg == "-cnv" || arg == "--conversation") { + params.conversation = true; + return true; + } + if (arg == "--infill") { + params.infill = true; + return true; + } + if (arg == "-dkvc" || arg == "--dump-kv-cache") { + params.dump_kv_cache = true; + return true; + } + if (arg == "-nkvo" || arg == "--no-kv-offload") { + params.no_kv_offload = true; + return true; + } + if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + return true; + } + if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; + return true; + } + if (arg == "-mli" || arg == "--multiline-input") { + params.multiline_input = true; + return true; + } + if (arg == "--simple-io") { + params.simple_io = true; + return true; + } + if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; + return true; + } + if (arg == "-nocb" || arg == "--no-cont-batching") { + params.cont_batching = false; + return true; + } + if (arg == "-fa" || arg == "--flash-attn") { + params.flash_attn = true; + return true; + } + if (arg == "-co" || arg == "--color") { + params.use_color = true; + return true; + } + if (arg == "--mlock") { + params.use_mlock = true; + return true; + } + if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { + CHECK_ARG + params.n_gpu_layers = std::stoi(argv[i]); + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + return true; + } + if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") { + CHECK_ARG + params.n_gpu_layers_draft = std::stoi(argv[i]); + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, 
--gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + return true; + } + if (arg == "--main-gpu" || arg == "-mg") { + CHECK_ARG + params.main_gpu = std::stoi(argv[i]); +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + return true; + } + if (arg == "--split-mode" || arg == "-sm") { + CHECK_ARG + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } + else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } + else { + invalid_param = true; + return true; + } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + return true; + } + if (arg == "--tensor-split" || arg == "-ts") { + CHECK_ARG + std::string arg_next = argv[i]; - size_t num_digits = mask.length() - start_i; - if (num_digits > 128) num_digits = 128; + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + invalid_param = true; + return true; + } + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } + else { + params.tensor_split[i] = 0.0f; + } + } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. 
Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + return true; + } + if (arg == "--rpc") { + CHECK_ARG + params.rpc_servers = argv[i]; + return true; + } + if (arg == "--no-mmap") { + params.use_mmap = false; + return true; + } + if (arg == "--numa") { + CHECK_ARG + std::string value(argv[i]); + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { invalid_param = true; } + return true; + } + if (arg == "-v" || arg == "--verbose") { + params.verbosity = 1; + return true; + } + if (arg == "--verbosity") { + CHECK_ARG + params.verbosity = std::stoi(argv[i]); + return true; + } + if (arg == "--verbose-prompt") { + params.verbose_prompt = true; + return true; + } + if (arg == "--no-display-prompt") { + params.display_prompt = false; + return true; + } + if (arg == "-r" || arg == "--reverse-prompt") { + CHECK_ARG + params.antiprompt.emplace_back(argv[i]); + return true; + } + if (arg == "-ld" || arg == "--logdir") { + CHECK_ARG + params.logdir = argv[i]; - size_t end_i = num_digits + start_i; - - for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) { - char c = mask.at(i); - int8_t id = c; - - if ((c >= '0' && c <= '9')) { - id -= '0'; - } else if (c >= 'a' && c <= 'f') { - id -= 'a' - 10; - } else if (c >= 'A' && c <= 'F') { - id -= 'A' - 10; + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } + return true; + } + if (arg == "-lcs" || arg == "--lookup-cache-static") { + CHECK_ARG + params.lookup_cache_static = argv[i]; + return true; + } + if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { + CHECK_ARG + params.lookup_cache_dynamic = argv[i]; + return true; + } + if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + CHECK_ARG + params.logits_file = argv[i]; + return true; + } + if (arg == "--perplexity" || arg == "--all-logits") { + params.logits_all = true; + return true; + } + if (arg == "--ppl-stride") { + CHECK_ARG + params.ppl_stride = std::stoi(argv[i]); + return true; + } + if (arg == "--ppl-output-type") { + CHECK_ARG + params.ppl_output_type = std::stoi(argv[i]); + return true; + } + if (arg == "-ptc" || arg == "--print-token-count") { + CHECK_ARG + params.n_print = std::stoi(argv[i]); + return true; + } + if (arg == "--check-tensors") { + params.check_tensors = true; + return true; + } + if (arg == "--hellaswag") { + params.hellaswag = true; + return true; + } + if (arg == "--hellaswag-tasks") { + CHECK_ARG + params.hellaswag_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--winogrande") { + params.winogrande = true; + return true; + } + if (arg == "--winogrande-tasks") { + CHECK_ARG + params.winogrande_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--multiple-choice") { + params.multiple_choice = true; + return true; + } + if (arg == "--multiple-choice-tasks") { + CHECK_ARG + params.multiple_choice_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--kl-divergence") { + params.kl_divergence = true; + return true; + } + if (arg == "--ignore-eos") { + params.ignore_eos = true; + return true; + } + if (arg == "--penalize-nl") { + sparams.penalize_nl = true; + return true; + } + if (arg == "-l" || arg == "--logit-bias") { + CHECK_ARG + std::stringstream ss(argv[i]); + llama_token key; + char sign; + std::string value_str; + try { + if 
(ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + } + else { + throw std::exception(); + } + } + catch (const std::exception&) { + invalid_param = true; + return true; + } + return true; + } + if (arg == "-h" || arg == "--help" || arg == "--usage" ) { + params.usage = true; + return true; + } + if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + if (arg == "--in-prefix-bos") { + params.input_prefix_bos = true; + params.enable_chat_template = false; + return true; + } + if (arg == "--in-prefix") { + CHECK_ARG + params.input_prefix = argv[i]; + params.enable_chat_template = false; + return true; + } + if (arg == "--in-suffix") { + CHECK_ARG + params.input_suffix = argv[i]; + params.enable_chat_template = false; + return true; + } + if (arg == "--spm-infill") { + params.spm_infill = true; + return true; + } + if (arg == "--grammar") { + CHECK_ARG + sparams.grammar = argv[i]; + return true; + } + if (arg == "--grammar-file") { + CHECK_ARG + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(sparams.grammar) + ); + return true; + } + if (arg == "-j" || arg == "--json-schema") { + CHECK_ARG + sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); + return true; + } + if (arg == "--override-kv") { + CHECK_ARG + if (!string_parse_kv_override(argv[i], params.kv_overrides)) { + fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + return true; + } + if (arg == "--host") { + CHECK_ARG + params.hostname = argv[i]; + return true; + } + if (arg == "--port") { + CHECK_ARG + params.port = std::stoi(argv[i]); + return true; + } + if (arg == "--path") { + CHECK_ARG + params.public_path = argv[i]; + return true; + } + if (arg == "--api-key") { + CHECK_ARG + params.api_keys.push_back(argv[i]); + return true; + } + if (arg == "--api-key-file") { + CHECK_ARG + std::ifstream key_file(argv[i]); + if (!key_file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); + return true; + } + if (arg == "--ssl-key-file") { + CHECK_ARG + params.ssl_file_key = argv[i]; + return true; + } + if (arg == "--ssl-cert-file") { + CHECK_ARG + params.ssl_file_cert = argv[i]; + return true; + } + if (arg == "--timeout" || arg == "-to") { + CHECK_ARG + params.timeout_read = std::stoi(argv[i]); + params.timeout_write = std::stoi(argv[i]); + return true; + } + if (arg == "--threads-http") { + CHECK_ARG + params.n_threads_http = std::stoi(argv[i]); + return true; + } + if (arg == "-spf" || arg == "--system-prompt-file") { + CHECK_ARG + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::string system_prompt; + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(system_prompt) + ); + params.system_prompt = system_prompt; + return true; + } + if (arg == 
"--log-format") { + CHECK_ARG + if (std::strcmp(argv[i], "json") == 0) { + params.log_json = true; + } else if (std::strcmp(argv[i], "text") == 0) { + params.log_json = false; } else { - LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i)); - return false; + invalid_param = true; + return true; } - - boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0); - boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0); - boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0); - boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0); + return true; } + if (arg == "--no-slots") { + params.endpoint_slots = false; + return true; + } + if (arg == "--metrics") { + params.endpoint_metrics = true; + return true; + } + if (arg == "--slot-save-path") { + CHECK_ARG + params.slot_save_path = argv[i]; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + return true; + } + if (arg == "--chat-template") { + CHECK_ARG + if (!llama_chat_verify_template(argv[i])) { + fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); + fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); + invalid_param = true; + return true; + } + params.chat_template = argv[i]; + return true; + } + if (arg == "--slot-prompt-similarity" || arg == "-sps") { + CHECK_ARG + params.slot_prompt_similarity = std::stof(argv[i]); + return true; + } + if (arg == "-pps") { + params.is_pp_shared = true; + return true; + } + if (arg == "-npp") { + CHECK_ARG + auto p = string_split(argv[i], split_delim); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + return true; + } + if (arg == "-ntg") { + CHECK_ARG + auto p = string_split(argv[i], split_delim); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + return true; + } + if (arg == "-npl") { + CHECK_ARG + auto p = string_split(argv[i], split_delim); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + return true; + } + if (arg == "--context-file") { + CHECK_ARG + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + params.context_files.push_back(argv[i]); + return true; + } + if (arg == "--chunk-size") { + CHECK_ARG + params.chunk_size = std::stoi(argv[i]); + return true; + } + if (arg == "--chunk-separator") { + CHECK_ARG + params.chunk_separator = argv[i]; + return true; + } + if (arg == "--junk") { + CHECK_ARG + params.n_junk = std::stoi(argv[i]); + return true; + } + if (arg == "--pos") { + CHECK_ARG + params.i_pos = std::stoi(argv[i]); + return true; + } + if (arg == "-o" || arg == "--output" || arg == "--output-file") { + CHECK_ARG + params.out_file = argv[i]; + params.cvector_outfile = argv[i]; + params.lora_outfile = argv[i]; + return true; + } + if (arg == "-ofreq" || arg == "--output-frequency") { + CHECK_ARG + params.n_out_freq = std::stoi(argv[i]); + return true; + } + if (arg == "--save-frequency") { + CHECK_ARG + params.n_save_freq = std::stoi(argv[i]); + return true; + } + if (arg == "--process-output") { + params.process_output = true; + return true; + } + if (arg == "--no-ppl") { + params.compute_ppl = false; + return true; + } + if (arg == "--chunk" || arg == "--from-chunk") { + CHECK_ARG + params.i_chunk = std::stoi(argv[i]); + return true; + } + // cvector params + if (arg == 
"--positive-file") { + CHECK_ARG + params.cvector_positive_file = argv[i]; + return true; + } + if (arg == "--negative-file") { + CHECK_ARG + params.cvector_negative_file = argv[i]; + return true; + } + if (arg == "--pca-batch") { + CHECK_ARG + params.n_pca_batch = std::stoi(argv[i]); + return true; + } + if (arg == "--pca-iter") { + CHECK_ARG + params.n_pca_iterations = std::stoi(argv[i]); + return true; + } + if (arg == "--method") { + CHECK_ARG + std::string value(argv[i]); + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { invalid_param = true; } + return true; + } +#ifndef LOG_DISABLE_LOGS + // Parse args for logging parameters + if (log_param_single_parse(argv[i])) { + // Do nothing, log_param_single_parse automatically does it's thing + // and returns if a match was found and parsed. + return true; + } + if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) { + // We have a matching known parameter requiring an argument, + // now we need to check if there is anything after this argv + // and flag invalid_param or parse it. + CHECK_ARG + if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { + invalid_param = true; + return true; + } + return true; + } + // End of Parse args for logging parameters +#endif // LOG_DISABLE_LOGS - return true; + return false; } -void common_init() { - llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { - if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) { - common_log_add(common_log_main(), level, "%s", text); - } - }, NULL); - -#ifdef NDEBUG - const char * build_type = ""; +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #else - const char * build_type = " (debug)"; +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) #endif - LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); +void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { + const llama_sampling_params & sparams = params.sparams; + + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto sampler_type : sparams.samplers_sequence) { + sampler_type_chars += static_cast(sampler_type); + sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; + } + sampler_type_names.pop_back(); + + struct option_info { + LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) + option_info(const std::string & tags, const char * args, const char * desc, ...) 
: tags(tags), args(args), desc(desc) { + va_list args_list; + va_start(args_list, desc); + char buffer[1024]; + vsnprintf(buffer, sizeof(buffer), desc, args_list); + va_end(args_list); + this->desc = buffer; + } + + option_info(const std::string & grp) : grp(grp) {} + + std::string tags; + std::string args; + std::string desc; + std::string grp; + }; + + std::vector options; + + // TODO: filter by tags + + options.push_back({ "general" }); + options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); + options.push_back({ "*", " --version", "show version and build info" }); + options.push_back({ "*", "-v, --verbose", "print verbose information" }); + options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity }); + options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); + options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); + options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); + options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads }); + options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); + options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); + options.push_back({ "speculative", "-tbd, --threads-batch-draft N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); + options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); + options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); + options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)" }); + options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)" }); + + options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx }); + options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict }); + options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch }); + options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); + options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); + options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); + options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? 
"enabled" : "disabled" }); + options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n" + "in conversation mode, this will be used as system prompt\n" + "(default: '%s')", params.prompt.c_str() }); + options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); + options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); + options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); + options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); + options.push_back({ "*", " --no-escape", "do not process escape sequences" }); + options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print }); + options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" }); + options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n" + "not supported with --interactive or other interactive options" }); + options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" }); + options.push_back({ "main", "-r, --reverse-prompt PROMPT", + "halt generation at PROMPT, return control in interactive mode\n" + "can be specified more than once for multiple prompts" }); + options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); + options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n" + "if suffix/prefix are not specified, default chat template will be used\n" + "(default: %s)", params.conversation ? "true" : "false" }); + options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); + options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); + options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); + options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); + options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); + options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + options.push_back({ "server infill", + " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" }); + + options.push_back({ "sampling" }); + options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" + "(default: %s)", sampler_type_names.c_str() }); + options.push_back({ "*", " --sampling-seq SEQUENCE", + "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() }); + options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" }); + options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" }); + options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp }); + options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k }); + options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p }); + options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p }); + options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z }); + options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p }); + options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n }); + options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat }); + options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present }); + options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq }); + options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range }); + options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent }); + options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n" + "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat }); + options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta }); + options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau }); + options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" + "i.e. 
`--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" }); + options.push_back({ "main", " --cfg-negative-prompt PROMPT", + "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() }); + options.push_back({ "main", " --cfg-negative-prompt-file FNAME", + "negative prompt file to use for guidance" }); + options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); + options.push_back({ "main", " --chat-template JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted:\n" + "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); + options.push_back({ "grammar" }); + options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); + options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); + options.push_back({ "*", "-j, --json-schema SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n" + "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); + + options.push_back({ "embedding" }); + options.push_back({ "embedding", " --pooling {none,mean,cls,last}", + "pooling type for embeddings, use model default if unspecified" }); + options.push_back({ "embedding", " --attention {causal,non-causal}", + "attention type for embeddings, use model default if unspecified" }); + + options.push_back({ "context hacking" }); + options.push_back({ "*", " --rope-scaling {none,linear,yarn}", + "RoPE frequency scaling method, defaults to linear unless specified by the model" }); + options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" }); + options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" }); + options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" }); + options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx }); + options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor }); + options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor }); + options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow }); + options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast }); + options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n }); + options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w }); + options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" }); + options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" }); + options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV 
cache data type for K (default: %s)", params.cache_type_k.c_str() }); + options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() }); + + options.push_back({ "perplexity" }); + options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" }); + options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks }); + options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks }); + options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --multiple-choice-tasks N", + "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks }); + options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" }); + options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride }); + options.push_back({ "perplexity", " --ppl-output-type {0,1}", + "output type for perplexity calculation (default: %d)", params.ppl_output_type }); + + options.push_back({ "parallel" }); + options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold }); + options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); + options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); + options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); + options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" }); + + options.push_back({ "multi-modality" }); + options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); + options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. 
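The perplexity options above score a model from its per-token probabilities. For reference, perplexity over N tokens is exp(-(1/N) * sum_i log p_i); the minimal sketch below applies that standard definition to a made-up probability vector and is not taken from this codebase.

#include <cmath>
#include <cstdio>
#include <vector>

// perplexity over N tokens: exp(-(1/N) * sum(log p_i))
static double perplexity(const std::vector<double> & token_probs) {
    double nll = 0.0;  // negative log-likelihood
    for (const double p : token_probs) {
        nll -= std::log(p);
    }
    return std::exp(nll / (double) token_probs.size());
}

int main() {
    // hypothetical per-token probabilities assigned by a model to the reference tokens
    const std::vector<double> probs = { 0.25, 0.50, 0.10, 0.40 };
    std::printf("perplexity = %.4f\n", perplexity(probs));
    return 0;
}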
Specify multiple times for batching" }); + + options.push_back({ "backend" }); + options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); + + if (llama_supports_mlock()) { + options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); + } + if (llama_supports_mmap()) { + options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" }); + } + options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n" + " - distribute: spread execution evenly over all nodes\n" + " - isolate: only spawn threads on CPUs on the node that execution started on\n" + " - numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggerganov/llama.cpp/issues/1437" }); + + if (llama_supports_gpu_offload()) { + options.push_back({ "*", "-ngl, --gpu-layers N", + "number of layers to store in VRAM" }); + options.push_back({ "*", "-ngld, --gpu-layers-draft N", + "number of layers to store in VRAM for the draft model" }); + options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", + "how to split the model across multiple GPUs, one of:\n" + " - none: use one GPU only\n" + " - layer (default): split layers and KV across GPUs\n" + " - row: split rows across GPUs" }); + options.push_back({ "*", "-ts, --tensor-split SPLIT", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" }); + options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n" + "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu }); + } + + options.push_back({ "model" }); + options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" }); + options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. 
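`-ts, --tensor-split` above takes comma-separated proportions such as `3,1`. The sketch below shows one way such a list could be turned into per-GPU fractions; `parse_tensor_split` is illustrative only and does not reproduce the project's argument parsing.

#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// turn a comma-separated proportion list ("3,1") into fractions that sum to 1
static std::vector<float> parse_tensor_split(const std::string & arg) {
    std::vector<float> parts;
    std::stringstream  ss(arg);
    std::string        item;
    float              total = 0.0f;
    while (std::getline(ss, item, ',')) {
        parts.push_back(std::stof(item));
        total += parts.back();
    }
    if (total > 0.0f) {
        for (float & p : parts) {
            p /= total;
        }
    }
    return parts;
}

int main() {
    for (const float f : parse_tensor_split("3,1")) {
        std::printf("%.2f ", f);  // 0.75 0.25
    }
    std::printf("\n");
    return 0;
}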
example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); + options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" }); + options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); + options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" + "note: this argument can be repeated to add multiple control vectors" }); + options.push_back({ "*", " --control-vector-scaled FNAME SCALE", + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors" }); + options.push_back({ "*", " --control-vector-layer-range START END", + "layer range to apply the control vector(s) to, start and end inclusive" }); + options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" + "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); + options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); + options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); + options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); + options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); + options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" }); + + options.push_back({ "retrieval" }); + options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); + options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size }); + options.push_back({ "retrieval", " --chunk-separator STRING", + "separator between chunks (default: '%s')", params.chunk_separator.c_str() }); + + options.push_back({ "passkey" }); + options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); + options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); + + options.push_back({ "imatrix" }); + options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() }); + options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq }); + options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); + options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); + options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); + options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk }); + + options.push_back({ "bench" }); + options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
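`--override-kv` above uses the form `KEY=TYPE:VALUE`, e.g. `tokenizer.ggml.add_bos_token=bool:false`. The sketch below splits that shape into its three parts; it mirrors the documented syntax only and is not the `string_parse_kv_override` implementation that appears later in this diff.

#include <cstdio>
#include <stdexcept>
#include <string>

struct kv_override_parts {
    std::string key;
    std::string type;
    std::string value;
};

// split "KEY=TYPE:VALUE" into its three parts; shape illustration only
static kv_override_parts split_kv_override(const std::string & data) {
    const size_t eq = data.find('=');
    if (eq == std::string::npos) {
        throw std::invalid_argument("malformed KV override: " + data);
    }
    const size_t colon = data.find(':', eq + 1);
    if (colon == std::string::npos) {
        throw std::invalid_argument("malformed KV override: " + data);
    }
    return { data.substr(0, eq), data.substr(eq + 1, colon - eq - 1), data.substr(colon + 1) };
}

int main() {
    const auto kv = split_kv_override("tokenizer.ggml.add_bos_token=bool:false");
    std::printf("key=%s type=%s value=%s\n", kv.key.c_str(), kv.type.c_str(), kv.value.c_str());
    return 0;
}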
"true" : "false" }); + options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); + options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); + options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); + + options.push_back({ "embedding" }); + options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize }); + options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" }); + options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" }); + + options.push_back({ "server" }); + options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); + options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); + options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); + options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" }); + options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); + options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); + options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); + options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); + options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); + options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http }); + options.push_back({ "server", " --system-prompt-file FNAME", + "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); + options.push_back({ "server", " --log-format {text,json}", + "log output format: json or text (default: json)" }); + options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" }); + options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? 
"enabled" : "disabled" }); + options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" }); + options.push_back({ "server", " --chat-template JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "only commonly used templates are accepted:\n" + "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); + options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", + "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); + +#ifndef LOG_DISABLE_LOGS + options.push_back({ "logging" }); + options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" }); + options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" }); + options.push_back({ "logging", " --log-test", "Run simple logging test" }); + options.push_back({ "logging", " --log-disable", "Disable trace logs" }); + options.push_back({ "logging", " --log-enable", "Enable trace logs" }); + options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" }); + options.push_back({ "logging", " --log-new", "Create a separate new log file on start. " + "Each log file will have unique name: \"..log\"" }); + options.push_back({ "logging", " --log-append", "Don't truncate the old log file." }); +#endif // LOG_DISABLE_LOGS + + options.push_back({ "cvector" }); + options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); + options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); + options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); + options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. 
Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); + options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); + options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); + + options.push_back({ "export-lora" }); + options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); + options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); + options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads }); + options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); + + printf("usage: %s [options]\n", argv[0]); + + for (const auto & o : options) { + if (!o.grp.empty()) { + printf("\n%s:\n\n", o.grp.c_str()); + continue; + } + printf(" %-32s", o.args.c_str()); + if (o.args.length() > 30) { + printf("\n%34s", ""); + } + + const auto desc = o.desc; + size_t start = 0; + size_t end = desc.find('\n'); + while (end != std::string::npos) { + printf("%s\n%34s", desc.substr(start, end - start).c_str(), ""); + start = end + 1; + end = desc.find('\n', start); + } + + printf("%s\n", desc.substr(start).c_str()); + } + printf("\n"); } -std::string common_params_get_system_info(const common_params & params) { +std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; - os << "system_info: n_threads = " << params.cpuparams.n_threads; - if (params.cpuparams_batch.n_threads != -1) { - os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")"; + os << "system_info: n_threads = " << params.n_threads; + if (params.n_threads_batch != -1) { + os << " (n_threads_batch = " << params.n_threads_batch << ")"; } -#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later - // TODO: windows + arm64 + mingw64 - DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); - os << " / " << logicalProcessorCount << " | " << llama_print_system_info(); -#else os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); -#endif return os.str(); } @@ -425,19 +1720,17 @@ std::string common_params_get_system_info(const common_params & params) { // String utils // -std::string string_format(const char * fmt, ...) 
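The usage loop above left-pads each option into a 32-character column and re-indents any embedded newline in a description by 34 spaces. The standalone sketch below reproduces that wrapping idea on a made-up option; the real loop additionally handles group headers and over-long argument strings.

#include <cstdio>
#include <string>

// print "  <args padded to 32 cols><desc>", re-indenting embedded '\n' by 34 spaces
static void print_option(const std::string & args, const std::string & desc) {
    std::printf("  %-32s", args.c_str());
    size_t start = 0;
    size_t end   = desc.find('\n');
    while (end != std::string::npos) {
        std::printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
        start = end + 1;
        end   = desc.find('\n', start);
    }
    std::printf("%s\n", desc.substr(start).c_str());
}

int main() {
    print_option("--example-flag N", "first line of the description\nsecond line, indented to the same column");
    return 0;
}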
{ - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); +std::vector string_split(std::string input, char separator) { + std::vector parts; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(0, separator_pos); + parts.emplace_back(part); + input = input.substr(separator_pos + 1); + separator_pos = input.find(separator); + } + parts.emplace_back(input); + return parts; } std::string string_strip(const std::string & str) { @@ -468,153 +1761,6 @@ std::string string_get_sortable_timestamp() { return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); } -void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { - if (search.empty()) { - return; - } - std::string builder; - builder.reserve(s.length()); - size_t pos = 0; - size_t last_pos = 0; - while ((pos = s.find(search, last_pos)) != std::string::npos) { - builder.append(s, last_pos, pos - last_pos); - builder.append(replace); - last_pos = pos + search.length(); - } - builder.append(s, last_pos, std::string::npos); - s = std::move(builder); -} - -std::string string_join(const std::vector & values, const std::string & separator) { - std::ostringstream result; - for (size_t i = 0; i < values.size(); ++i) { - if (i > 0) { - result << separator; - } - result << values[i]; - } - return result.str(); -} - -std::vector string_split(const std::string & str, const std::string & delimiter) { - std::vector parts; - size_t start = 0; - size_t end = str.find(delimiter); - - while (end != std::string::npos) { - parts.push_back(str.substr(start, end - start)); - start = end + delimiter.length(); - end = str.find(delimiter, start); - } - - parts.push_back(str.substr(start)); - - return parts; -} - -std::string string_repeat(const std::string & str, size_t n) { - if (n == 0) { - return ""; - } - - std::string result; - result.reserve(str.length() * n); - - for (size_t i = 0; i < n; ++i) { - result += str; - } - - return result; -} - -std::string string_from(bool value) { - return value ? 
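A quick usage sketch for the `string_split(std::string, char)` helper added above: it splits on every separator and keeps empty parts. The function body below repeats the behavior of the diff's version (with the elided template argument restored as `std::string`); `main` is illustration only.

#include <cstdio>
#include <string>
#include <vector>

// same behavior as the helper added in this hunk (template argument restored)
static std::vector<std::string> string_split(std::string input, char separator) {
    std::vector<std::string> parts;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        parts.emplace_back(input.substr(0, separator_pos));
        input = input.substr(separator_pos + 1);
        separator_pos = input.find(separator);
    }
    parts.emplace_back(input);
    return parts;
}

int main() {
    for (const auto & part : string_split("a,,b", ',')) {
        std::printf("[%s] ", part.c_str());  // [a] [] [b]
    }
    std::printf("\n");
    return 0;
}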
"true" : "false"; -} - -std::string string_from(const std::vector & values) { - std::stringstream buf; - - buf << "[ "; - bool first = true; - for (auto e : values) { - if (first) { - first = false; - } else { - buf << ", "; - } - buf << std::to_string(e); - } - buf << " ]"; - - return buf.str(); -} - -std::string string_from(const struct llama_context * ctx, const std::vector & tokens) { - std::stringstream buf; - - buf << "[ "; - - bool first = true; - for (const auto & token : tokens) { - if (!first) { - buf << ", "; - } else { - first = false; - } - - auto detokenized = common_token_to_piece(ctx, token); - - detokenized.erase( - std::remove_if( - detokenized.begin(), - detokenized.end(), - [](const unsigned char c) { return !std::isprint(c); }), - detokenized.end()); - - buf << "'" << detokenized << "'" - << ":" << std::to_string(token); - } - - buf << " ]"; - - return buf.str(); -} - -std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) { - std::stringstream buf; - - buf << "[ "; - - bool first = true; - for (int i = 0; i < batch.n_tokens; ++i) { - if (!first) { - buf << ", "; - } else { - first = false; - } - - auto detokenized = common_token_to_piece(ctx, batch.token[i]); - - detokenized.erase( - std::remove_if( - detokenized.begin(), - detokenized.end(), - [](const unsigned char c) { return !std::isprint(c); }), - detokenized.end()); - - buf << "\n" << std::to_string(i) - << ", token '" << detokenized << "'" - << ", pos " << std::to_string(batch.pos[i]) - << ", n_seq_id " << std::to_string(batch.n_seq_id[i]) - << ", seq_id " << std::to_string(batch.seq_id[i][0]) - << ", logits " << std::to_string(batch.logits[i]); - } - - buf << " ]"; - - return buf.str(); -} - void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -655,7 +1801,7 @@ void string_process_escapes(std::string & input) { bool string_parse_kv_override(const char * data, std::vector & overrides) { const char * sep = strchr(data, '='); if (sep == nullptr || sep - data >= 128) { - LOG_ERR("%s: malformed KV override '%s'\n", __func__, data); + fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); return false; } llama_model_kv_override kvo; @@ -678,20 +1824,20 @@ bool string_parse_kv_override(const char * data, std::vector 127) { - LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); + fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); return false; } strncpy(kvo.val_str, sep, 127); kvo.val_str[127] = '\0'; } else { - LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data); + fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); return false; } overrides.emplace_back(std::move(kvo)); @@ -718,17 +1864,7 @@ bool fs_validate_filename(const std::string & filename) { std::u32string filename_utf32; try { -#if defined(__clang__) - // disable C++17 deprecation warning for std::codecvt_utf8 -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wdeprecated-declarations" -#endif std::wstring_convert, char32_t> converter; - -#if defined(__clang__) -# pragma clang diagnostic pop -#endif - filename_utf32 = converter.from_bytes(filename); // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used, @@ -898,195 +2034,112 @@ std::string fs_get_cache_file(const std::string & filename) { // // Model utils // -struct common_init_result 
common_init_from_params(common_params & params) { - common_init_result iparams; - auto mparams = common_model_params_to_llama(params); + +std::tuple llama_init_from_gpt_params(gpt_params & params) { + auto mparams = llama_model_params_from_gpt_params(params); llama_model * model = nullptr; if (!params.hf_repo.empty() && !params.hf_file.empty()) { - model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams); + model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); } else if (!params.model_url.empty()) { - model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams); + model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); } else { - model = llama_model_load_from_file(params.model.c_str(), mparams); + model = llama_load_model_from_file(params.model.c_str(), mparams); } if (model == NULL) { - LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str()); - return iparams; + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return std::make_tuple(nullptr, nullptr); } - const llama_vocab * vocab = llama_model_get_vocab(model); + auto cparams = llama_context_params_from_gpt_params(params); - if (params.reranking) { - bool ok = true; - - if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__); - ok = false; - } - - if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__); - ok = false; - } - - if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__); - ok = false; - } - - if (!ok) { - llama_model_free(model); - - return iparams; - } - } - - auto cparams = common_context_params_to_llama(params); - - llama_context * lctx = llama_init_from_model(model, cparams); + llama_context * lctx = llama_new_context_with_model(model, cparams); if (lctx == NULL) { - LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_model_free(model); - return iparams; - } - - if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) { - LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__); - params.ctx_shift = false; + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } if (!params.control_vectors.empty()) { if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; - if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model); + if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); - const auto cvec = common_control_vector_load(params.control_vectors); + const auto cvec = llama_control_vector_load(params.control_vectors); if (cvec.n_embd == -1) { llama_free(lctx); - llama_model_free(model); - - return iparams; + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } - int err = llama_apply_adapter_cvec( - lctx, - cvec.data.data(), - cvec.data.size(), - cvec.n_embd, - params.control_vector_layer_start, - params.control_vector_layer_end); + int 
err = llama_control_vector_apply(lctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + params.control_vector_layer_start, + params.control_vector_layer_end); if (err) { llama_free(lctx); - llama_model_free(model); - - return iparams; + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } } - // load and optionally apply lora adapters - for (auto & la : params.lora_adapters) { - llama_adapter_lora_ptr lora; - lora.reset(llama_adapter_lora_init(model, la.path.c_str())); - if (lora == nullptr) { - LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); + if (adapter == nullptr) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); - llama_model_free(model); - return iparams; + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } - - la.ptr = lora.get(); - iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters + llama_lora_adapter_set(lctx, adapter, lora_scale); } - if (!params.lora_init_without_apply) { - common_set_adapter_lora(lctx, params.lora_adapters); - } - - if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__); - params.sampling.ignore_eos = false; - } - - if (params.sampling.ignore_eos) { - for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) { - if (llama_vocab_is_eog(vocab, i)) { - LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY); - params.sampling.logit_bias.push_back({i, -INFINITY}); - } - } - } - - if (params.sampling.penalty_last_n == -1) { - LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); - params.sampling.penalty_last_n = llama_n_ctx(lctx); - } - - if (params.sampling.dry_penalty_last_n == -1) { - LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); - params.sampling.dry_penalty_last_n = llama_n_ctx(lctx); + if (params.ignore_eos) { + params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } if (params.warmup) { - LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); + LOG("warming up the model with an empty run\n"); std::vector tmp; - llama_token bos = llama_vocab_bos(vocab); - llama_token eos = llama_vocab_eos(vocab); - + llama_token bos = llama_token_bos(model); + llama_token eos = llama_token_eos(model); // some models (e.g. 
T5) don't have a BOS token - if (bos != LLAMA_TOKEN_NULL) { + if (bos != -1) { tmp.push_back(bos); } - if (eos != LLAMA_TOKEN_NULL) { - tmp.push_back(eos); - } - if (tmp.empty()) { - tmp.push_back(0); - } + tmp.push_back(eos); if (llama_model_has_encoder(model)) { - llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size())); + llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0)); llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == LLAMA_TOKEN_NULL) { + if (decoder_start_token_id == -1) { decoder_start_token_id = bos; } tmp.clear(); tmp.push_back(decoder_start_token_id); } - if (llama_model_has_decoder(model)) { - llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); - } + llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_clear(lctx); llama_synchronize(lctx); - llama_perf_context_reset(lctx); + llama_reset_timings(lctx); } - iparams.model.reset(model); - iparams.context.reset(lctx); - - return iparams; + return std::make_tuple(model, lctx); } -void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora) { - llama_clear_adapter_lora(ctx); - for (auto & la : lora) { - if (la.scale != 0.0f) { - llama_set_adapter_lora(ctx, la.ptr, la.scale); - } - } -} - -struct llama_model_params common_model_params_to_llama(common_params & params) { +struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { auto mparams = llama_model_default_params(); - if (!params.devices.empty()) { - mparams.devices = params.devices.data(); - } if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; } + mparams.rpc_servers = params.rpc_servers.c_str(); mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; @@ -1103,16 +2156,45 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { return mparams; } -struct llama_context_params common_context_params_to_llama(const common_params & params) { +static ggml_type kv_cache_type_from_str(const std::string & s) { + if (s == "f32") { + return GGML_TYPE_F32; + } + if (s == "f16") { + return GGML_TYPE_F16; + } + if (s == "q8_0") { + return GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return GGML_TYPE_Q4_1; + } + if (s == "iq4_nl") { + return GGML_TYPE_IQ4_NL; + } + if (s == "q5_0") { + return GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return GGML_TYPE_Q5_1; + } + + throw std::runtime_error("Invalid cache type: " + s); +} + +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); cparams.n_ctx = params.n_ctx; cparams.n_seq_max = params.n_parallel; cparams.n_batch = params.n_batch; cparams.n_ubatch = params.n_ubatch; - cparams.n_threads = params.cpuparams.n_threads; - cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? - params.cpuparams.n_threads : params.cpuparams_batch.n_threads; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; + cparams.seed = params.seed; cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; @@ -1130,69 +2212,26 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; - cparams.no_perf = params.no_perf; - if (params.reranking) { - cparams.embeddings = true; - cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; - } - - cparams.type_k = params.cache_type_k; - cparams.type_v = params.cache_type_v; + cparams.type_k = kv_cache_type_from_str(params.cache_type_k); + cparams.type_v = kv_cache_type_from_str(params.cache_type_v); return cparams; } -struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) { - struct ggml_threadpool_params tpp; - - ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults - - if (params.mask_valid) { - std::memcpy(&tpp.cpumask, ¶ms.cpumask, GGML_MAX_N_THREADS); - } - - tpp.prio = params.priority; - tpp.poll = params.poll; - tpp.strict_cpu = params.strict_cpu; - - return tpp; -} - #ifdef LLAMA_USE_CURL -#define CURL_MAX_RETRY 3 -#define CURL_RETRY_DELAY_SECONDS 2 - -static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) { - int remaining_attempts = max_attempts; - - while (remaining_attempts > 0) { - LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); - - CURLcode res = curl_easy_perform(curl); - if (res == CURLE_OK) { - return true; - } - - int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; - LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); - - remaining_attempts--; - std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); - } - - LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); - - return false; +static bool starts_with(const std::string & str, const std::string & prefix) { + // While we wait for C++20's std::string::starts_with... 
+ return str.rfind(prefix, 0) == 0; } -static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { +static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { + // Initialize libcurl - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - curl_slist_ptr http_headers; + std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); if (!curl) { - LOG_ERR("%s: error initializing libcurl\n", __func__); + fprintf(stderr, "%s: error initializing libcurl\n", __func__); return false; } @@ -1204,9 +2243,11 @@ static bool common_download_file(const std::string & url, const std::string & pa // Check if hf-token or bearer-token was specified if (!hf_token.empty()) { - std::string auth_header = "Authorization: Bearer " + hf_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + std::string auth_header = "Authorization: Bearer "; + auth_header += hf_token.c_str(); + struct curl_slist *http_headers = NULL; + http_headers = curl_slist_append(http_headers, auth_header.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers); } #if defined(_WIN32) @@ -1216,7 +2257,8 @@ static bool common_download_file(const std::string & url, const std::string & pa #endif // Check if the file already exists locally - auto file_exists = std::filesystem::exists(path); + struct stat model_file_info; + auto file_exists = (stat(path.c_str(), &model_file_info) == 0); // If the file exists, check its JSON metadata companion file. std::string metadata_path = path + ".json"; @@ -1230,11 +2272,11 @@ static bool common_download_file(const std::string & url, const std::string & pa if (metadata_in.good()) { try { metadata_in >> metadata; - LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); + fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); if (metadata.contains("url") && metadata.at("url").is_string()) { auto previous_url = metadata.at("url").get(); if (previous_url != url) { - LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); + fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); return false; } } @@ -1245,26 +2287,24 @@ static bool common_download_file(const std::string & url, const std::string & pa last_modified = metadata.at("lastModified"); } } catch (const nlohmann::json::exception & e) { - LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); return false; } } } else { - LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); + fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str()); } // Send a HEAD request to retrieve the etag and last-modified headers - struct common_load_model_from_url_headers { + struct llama_load_model_from_url_headers { std::string etag; std::string last_modified; }; - - common_load_model_from_url_headers headers; - + llama_load_model_from_url_headers headers; { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - common_load_model_from_url_headers 
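The `starts_with` helper added above emulates C++20 `std::string::starts_with` by requiring the prefix to match at position 0 via `rfind(prefix, 0) == 0`. A small self-contained check:

#include <cassert>
#include <string>

// the prefix may only match at position 0, hence rfind(prefix, 0)
static bool starts_with(const std::string & str, const std::string & prefix) {
    return str.rfind(prefix, 0) == 0;
}

int main() {
    assert(starts_with("https://huggingface.co/x", "https://"));
    assert(!starts_with("file://x", "https://"));
    return 0;
}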
* headers = (common_load_model_from_url_headers *) userdata; + llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex etag_regex("ETag", std::regex_constants::icase); @@ -1289,8 +2329,9 @@ static bool common_download_file(const std::string & url, const std::string & pa curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); - if (!was_perform_successful) { + CURLcode res = curl_easy_perform(curl.get()); + if (res != CURLE_OK) { + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); return false; } @@ -1300,26 +2341,26 @@ static bool common_download_file(const std::string & url, const std::string & pa // HEAD not supported, we don't know if the file has changed // force trigger downloading force_download = true; - LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); } } bool should_download = !file_exists || force_download; if (!should_download) { if (!etag.empty() && etag != headers.etag) { - LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); + fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); should_download = true; } else if (!last_modified.empty() && last_modified != headers.last_modified) { - LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); + fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); should_download = true; } } if (should_download) { std::string path_temporary = path + ".downloadInProgress"; if (file_exists) { - LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); + fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); if (remove(path.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); + fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str()); return false; } } @@ -1334,7 +2375,7 @@ static bool common_download_file(const std::string & url, const std::string & pa std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); if (!outfile) { - LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str()); + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); return false; } @@ -1365,17 +2406,18 @@ static bool common_download_file(const std::string & url, const std::string & pa }; // start the download - LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); - bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); - if (!was_perform_successful) { + fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", 
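The download path above keeps a `.json` companion file with the last seen `ETag` and `Last-Modified` values and only re-downloads when the server reports different ones. The sketch below isolates that freshness decision; the struct and sample header values are hypothetical.

#include <cstdio>
#include <string>

struct cached_headers {
    std::string etag;
    std::string last_modified;
};

// re-download when there is no local file, a forced refresh, or the server headers changed
static bool should_download(bool file_exists, bool force_download,
                            const cached_headers & cached, const cached_headers & server) {
    if (!file_exists || force_download) {
        return true;
    }
    if (!cached.etag.empty() && cached.etag != server.etag) {
        return true;
    }
    if (!cached.last_modified.empty() && cached.last_modified != server.last_modified) {
        return true;
    }
    return false;
}

int main() {
    const cached_headers cached = { "\"abc123\"", "Mon, 01 Jan 2024 00:00:00 GMT" };
    const cached_headers server = { "\"def456\"", "Mon, 01 Jan 2024 00:00:00 GMT" };
    std::printf("download again: %s\n",
                should_download(true, false, cached, server) ? "yes" : "no");  // yes (ETag changed)
    return 0;
}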
__func__, + llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); + auto res = curl_easy_perform(curl.get()); + if (res != CURLE_OK) { + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); return false; } long http_code = 0; curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); if (http_code < 200 || http_code >= 400) { - LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); + fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); return false; } @@ -1389,10 +2431,10 @@ static bool common_download_file(const std::string & url, const std::string & pa {"lastModified", headers.last_modified} }); std::ofstream(metadata_path) << metadata.dump(4); - LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); + fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); if (rename(path_temporary.c_str(), path.c_str()) != 0) { - LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); + fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); return false; } } @@ -1400,18 +2442,18 @@ static bool common_download_file(const std::string & url, const std::string & pa return true; } -struct llama_model * common_load_model_from_url( - const std::string & model_url, - const std::string & local_path, - const std::string & hf_token, +struct llama_model * llama_load_model_from_url( + const char * model_url, + const char * path_model, + const char * hf_token, const struct llama_model_params & params) { // Basic validation of the model_url - if (model_url.empty()) { - LOG_ERR("%s: invalid model_url\n", __func__); + if (!model_url || strlen(model_url) == 0) { + fprintf(stderr, "%s: invalid model_url\n", __func__); return NULL; } - if (!common_download_file(model_url, local_path, hf_token)) { + if (!llama_download_file(model_url, path_model, hf_token)) { return NULL; } @@ -1422,9 +2464,9 @@ struct llama_model * common_load_model_from_url( /*.no_alloc = */ true, /*.ctx = */ NULL, }; - auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params); + auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); if (!ctx_gguf) { - LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str()); + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); return NULL; } @@ -1443,13 +2485,15 @@ struct llama_model * common_load_model_from_url( // Verify the first split file format // and extract split URL and PATH prefixes { - if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) { - LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split); + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { + fprintf(stderr, "\n%s: unexpected model file name: %s" + " n_split=%d\n", __func__, path_model, n_split); return NULL; } - if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) { - LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split); + if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { + fprintf(stderr, "\n%s: unexpected model url: %s" + " n_split=%d\n", __func__, model_url, n_split); return NULL; } } @@ -1464,7 +2508,7 @@ struct 
llama_model * common_load_model_from_url( char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); - return common_download_file(split_url, split_path, hf_token); + return llama_download_file(split_url, split_path, hf_token); }, idx)); } @@ -1476,14 +2520,14 @@ struct llama_model * common_load_model_from_url( } } - return llama_model_load_from_file(local_path.c_str(), params); + return llama_load_model_from_file(path_model, params); } -struct llama_model * common_load_model_from_hf( - const std::string & repo, - const std::string & remote_path, - const std::string & local_path, - const std::string & hf_token, +struct llama_model * llama_load_model_from_hf( + const char * repo, + const char * model, + const char * path_model, + const char * hf_token, const struct llama_model_params & params) { // construct hugging face model url: // @@ -1497,129 +2541,48 @@ struct llama_model * common_load_model_from_hf( std::string model_url = "https://huggingface.co/"; model_url += repo; model_url += "/resolve/main/"; - model_url += remote_path; + model_url += model; - return common_load_model_from_url(model_url, local_path, hf_token, params); -} - -/** - * Allow getting the HF file from the HF repo with tag (like ollama), for example: - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 - * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s - * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) - * - * Return pair of (with "repo" already having tag removed) - * - * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. - */ -std::pair common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { - auto parts = string_split(hf_repo_with_tag, ':'); - std::string tag = parts.size() > 1 ? 
parts.back() : "latest"; - std::string hf_repo = parts[0]; - if (string_split(hf_repo, '/').size() != 2) { - throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); - } - - // fetch model info from Hugging Face Hub API - json model_info; - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - curl_slist_ptr http_headers; - std::string res_str; - std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; - curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); - typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); - auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { - static_cast(data)->append((char * ) ptr, size * nmemb); - return size * nmemb; - }; - curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); -#if defined(_WIN32) - curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); -#endif - if (!hf_token.empty()) { - std::string auth_header = "Authorization: Bearer " + hf_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); - } - // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response - http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); - http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); - - CURLcode res = curl_easy_perform(curl.get()); - - if (res != CURLE_OK) { - throw std::runtime_error("error: cannot make GET request to HF API"); - } - - long res_code; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); - if (res_code == 200) { - model_info = json::parse(res_str); - } else if (res_code == 401) { - throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); - } else { - throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); - } - - // check response - if (!model_info.contains("ggufFile")) { - throw std::runtime_error("error: model does not have ggufFile"); - } - json & gguf_file = model_info.at("ggufFile"); - if (!gguf_file.contains("rfilename")) { - throw std::runtime_error("error: ggufFile does not have rfilename"); - } - - return std::make_pair(hf_repo, gguf_file.at("rfilename")); + return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params); } #else -struct llama_model * common_load_model_from_url( - const std::string & /*model_url*/, - const std::string & /*local_path*/, - const std::string & /*hf_token*/, +struct llama_model * llama_load_model_from_url( + const char * /*model_url*/, + const char * /*path_model*/, + const char * /*hf_token*/, const struct llama_model_params & /*params*/) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); return nullptr; } -struct llama_model * common_load_model_from_hf( - const std::string & /*repo*/, - const std::string & /*remote_path*/, - const std::string & /*local_path*/, - const std::string & /*hf_token*/, +struct llama_model * llama_load_model_from_hf( + const char * /*repo*/, + const char * /*model*/, + const 
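The removed `common_get_hf_file` helper above accepts `repo[:tag]` specs such as `bartowski/Llama-3.2-3B-Instruct-GGUF:q4`, defaulting the tag to `latest` when it is omitted. A minimal sketch of that split (illustration only; the real helper also queries the Hugging Face manifest API):

#include <cstdio>
#include <string>
#include <utility>

// split "repo[:tag]", defaulting the tag to "latest"
static std::pair<std::string, std::string> split_repo_tag(const std::string & repo_with_tag) {
    const size_t colon = repo_with_tag.find(':');
    if (colon == std::string::npos) {
        return { repo_with_tag, "latest" };
    }
    return { repo_with_tag.substr(0, colon), repo_with_tag.substr(colon + 1) };
}

int main() {
    const auto repo_tag = split_repo_tag("bartowski/Llama-3.2-3B-Instruct-GGUF:q4");
    std::printf("repo=%s tag=%s\n", repo_tag.first.c_str(), repo_tag.second.c_str());
    return 0;
}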
char * /*path_model*/, + const char * /*hf_token*/, const struct llama_model_params & /*params*/) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); return nullptr; } -std::pair common_get_hf_file(const std::string &, const std::string &) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); - return std::make_pair("", ""); -} - #endif // LLAMA_USE_CURL // // Batch utils // -void common_batch_clear(struct llama_batch & batch) { +void llama_batch_clear(struct llama_batch & batch) { batch.n_tokens = 0; } -void common_batch_add( +void llama_batch_add( struct llama_batch & batch, llama_token id, llama_pos pos, const std::vector & seq_ids, bool logits) { - GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded"); - batch.token [batch.n_tokens] = id; batch.pos [batch.n_tokens] = pos; batch.n_seq_id[batch.n_tokens] = seq_ids.size(); @@ -1631,92 +2594,30 @@ void common_batch_add( batch.n_tokens++; } -// -// Token utils -// - -size_t common_lcp(const llama_tokens & a, const llama_tokens & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} - - return i; -} - -size_t common_lcs(const llama_tokens & a, const llama_tokens & b) { - // check for empty sequences - if (a.empty() || b.empty()) { - return 0; - } - - // get the lengths of the input sequences - size_t a_len = a.size(); - size_t b_len = b.size(); - - // initialize the maximum length of the longest common subsequence (LCS) - size_t max_length = 0; - - // use two rows instead of a 2D matrix to optimize space - std::vector prev_row(b_len + 1, 0); - std::vector curr_row(b_len + 1, 0); - - // iterate through the elements of a - for (size_t i = 1; i <= a_len; i++) { - // iterate through the elements of b - for (size_t j = 1; j <= b_len; j++) { - // if elements at the current positions match - if (a[i - 1] == b[j - 1]) { - // if it's the first element of either sequences, set LCS length to 1 - if (i == 1 || j == 1) { - curr_row[j] = 1; - } else { - // increment LCS length by 1 compared to the previous element - curr_row[j] = prev_row[j - 1] + 1; - } - - // update max_length if necessary - if (curr_row[j] > max_length) { - max_length = curr_row[j]; - } - } else { - // reset LCS length if elements don't match - curr_row[j] = 0; - } - } - - // update the previous row for the next iteration - prev_row = curr_row; - } - - // return the maximum length of the LCS - return max_length; -} - // // Vocab utils // -std::vector common_tokenize( +std::vector llama_tokenize( const struct llama_context * ctx, const std::string & text, bool add_special, bool parse_special) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - return common_tokenize(vocab, text, add_special, parse_special); + return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special); } -std::vector common_tokenize( - const struct llama_vocab * vocab, +std::vector llama_tokenize( + const struct llama_model * model, const std::string & text, bool add_special, bool parse_special) { // upper limit for the number of tokens int n_tokens = text.length() + 2 * add_special; std::vector result(n_tokens); - n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); + n_tokens = 
llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); + int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -1724,19 +2625,13 @@ std::vector common_tokenize( return result; } -std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - return common_token_to_piece(vocab, token, special); -} - -std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) { +std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { std::string piece; piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' - const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); + const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); if (n_chars < 0) { piece.resize(-n_chars); - int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); + int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); GGML_ASSERT(check == -n_chars); } else { @@ -1746,19 +2641,13 @@ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token return piece; } -std::string common_detokenize(const struct llama_context * ctx, const std::vector & tokens, bool special) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - return common_detokenize(vocab, tokens, special); -} - -std::string common_detokenize(const struct llama_vocab * vocab, const std::vector & tokens, bool special) { +std::string llama_detokenize(llama_context * ctx, const std::vector & tokens, bool special) { std::string text; text.resize(std::max(text.capacity(), tokens.size())); - int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); if (n_chars < 0) { text.resize(-n_chars); - n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization } @@ -1768,179 +2657,102 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto return text; } +bool llama_should_add_bos_token(const llama_model * model) { + const int add_bos = llama_add_bos_token(model); + + return add_bos != -1 ? 
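The tokenize/detokenize wrappers above all use the same two-pass pattern: call once with a guessed buffer and, if the API returns a negative value, treat its magnitude as the required size, resize, and call again. The self-contained analogue below demonstrates the pattern with `fake_tokenize`, a stand-in for the real llama.cpp call.

#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

// pretend API: writes one id per character; returns -needed when the buffer is too small
static int fake_tokenize(const char * text, int * out, int n_out) {
    const int needed = (int) std::strlen(text);
    if (needed > n_out) {
        return -needed;
    }
    for (int i = 0; i < needed; ++i) {
        out[i] = (int) text[i];
    }
    return needed;
}

int main() {
    const std::string text = "hello world, this is longer than the first guess";
    std::vector<int> result(8);  // deliberately undersized first guess
    int n = fake_tokenize(text.c_str(), result.data(), (int) result.size());
    if (n < 0) {
        result.resize(-n);       // the magnitude is the required size
        n = fake_tokenize(text.c_str(), result.data(), (int) result.size());
    }
    result.resize(n);
    std::printf("tokenized into %zu ids\n", result.size());
    return 0;
}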
bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); +} + // // Chat template utils // -bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { - if (use_jinja) { - try { - auto chat_template = common_chat_template(tmpl, "", ""); - common_chat_inputs inputs; - inputs.messages = json::array({{ - {"role", "user"}, - {"content", "test"}, - }}); - common_chat_params_init(chat_template, inputs); - return true; - } catch (const std::exception & e) { - LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what()); - return false; - } - } +bool llama_chat_verify_template(const std::string & tmpl) { llama_chat_message chat[] = {{"user", "test"}}; - const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0); + int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); return res >= 0; } -std::string common_chat_apply_template( - const common_chat_template & tmpl, - const std::vector & msgs, - bool add_ass, - bool use_jinja) { - if (use_jinja) { - auto messages = json::array(); - for (const auto & msg : msgs) { - messages.push_back({{"role", msg.role}, {"content", msg.content}}); - } - common_chat_inputs inputs; - inputs.messages = messages; - inputs.add_generation_prompt = add_ass; - return common_chat_params_init(tmpl, inputs).prompt; - } - +std::string llama_chat_apply_template(const struct llama_model * model, + const std::string & tmpl, + const std::vector & msgs, + bool add_ass) { int alloc_size = 0; + bool fallback = false; // indicate if we must fallback to default chatml std::vector chat; - for (const auto & msg : msgs) { + for (auto & msg : msgs) { chat.push_back({msg.role.c_str(), msg.content.c_str()}); alloc_size += (msg.role.size() + msg.content.size()) * 1.25; } + const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); std::vector buf(alloc_size); // run the first time to get the total output length - int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); // error: chat template is not supported if (res < 0) { - // if the custom "tmpl" is not supported, we throw an error - // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() - throw std::runtime_error("this custom template is not supported"); + if (ptr_tmpl != nullptr) { + // if the custom "tmpl" is not supported, we throw an error + // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() + throw std::runtime_error("this custom template is not supported"); + } else { + // If the built-in template is not supported, we default to chatml + res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + fallback = true; + } } // if it turns out that our buffer is too small, we resize it if ((size_t) res > buf.size()) { buf.resize(res); - res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + res = llama_chat_apply_template( + fallback ? nullptr : model, + fallback ? 
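The chat helpers in this hunk apply a template to a message list (falling back to chatml when the model's own template is unsupported), and `llama_chat_format_single`, just below, formats the history with and without the newest message and keeps only the added suffix. A toy sketch of that suffix-diffing idea; the `<role>` syntax here is made up and not a real chat template.

#include <cstdio>
#include <string>
#include <vector>

struct msg {
    std::string role;
    std::string content;
};

// toy formatter standing in for a chat template
static std::string toy_format(const std::vector<msg> & msgs) {
    std::string out;
    for (const auto & m : msgs) {
        out += "<" + m.role + ">" + m.content + "\n";
    }
    return out;
}

int main() {
    std::vector<msg> history = { { "user", "Hello" }, { "assistant", "Hi there" } };
    const std::string before = toy_format(history);

    history.push_back({ "user", "How are you?" });
    const std::string after = toy_format(history);

    // only the newly added suffix needs to be passed on
    std::printf("%s", after.substr(before.size()).c_str());
    return 0;
}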
"chatml" : ptr_tmpl, + chat.data(), chat.size(), add_ass, buf.data(), buf.size()); } std::string formatted_chat(buf.data(), res); return formatted_chat; } -std::string common_chat_format_single( - const common_chat_template & tmpl, - const std::vector & past_msg, - const common_chat_msg & new_msg, - bool add_ass, - bool use_jinja) { +std::string llama_chat_format_single(const struct llama_model * model, + const std::string & tmpl, + const std::vector & past_msg, + const llama_chat_msg & new_msg, + bool add_ass) { std::ostringstream ss; - auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja); - std::vector chat_new(past_msg); + auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false); + std::vector chat_new(past_msg); // if the past_msg ends with a newline, we must preserve it in the formatted version if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { ss << "\n"; }; // format chat with new_msg chat_new.push_back(new_msg); - auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja); + auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass); // get the diff part ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); return ss.str(); } -std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) { - std::vector msgs = { - {"system", "You are a helpful assistant", {}}, - {"user", "Hello", {}}, - {"assistant", "Hi there", {}}, - {"user", "How are you?", {}}, +std::string llama_chat_format_example(const struct llama_model * model, + const std::string & tmpl) { + std::vector msgs = { + {"system", "You are a helpful assistant"}, + {"user", "Hello"}, + {"assistant", "Hi there"}, + {"user", "How are you?"}, }; - return common_chat_apply_template(tmpl, msgs, true, use_jinja); -} - -#define CHATML_TEMPLATE_SRC \ - "{%- for message in messages -%}\n" \ - " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \ - "{%- endfor -%}\n" \ - "{%- if add_generation_prompt -%}\n" \ - " {{- '<|im_start|>assistant\n' -}}\n" \ - "{%- endif -%}" - -common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override) -{ - std::string default_template_src; - std::string template_tool_use_src; - - bool has_explicit_template = !chat_template_override.empty(); - if (chat_template_override.empty()) { - auto str = llama_model_chat_template(model, /* name */ nullptr); - if (str) { - default_template_src = str; - has_explicit_template = true; - } - str = llama_model_chat_template(model, /* name */ "tool_use"); - if (str) { - template_tool_use_src = str; - has_explicit_template = true; - } - } else { - default_template_src = chat_template_override; - } - if (default_template_src.empty() || default_template_src == "chatml") { - if (!template_tool_use_src.empty()) { - default_template_src = template_tool_use_src; - } else { - default_template_src = CHATML_TEMPLATE_SRC; - } - } - auto vocab = llama_model_get_vocab(model); - const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) { - if (token == LLAMA_TOKEN_NULL) { - if (default_template_src.find(jinja_variable_name) != std::string::npos - || template_tool_use_src.find(jinja_variable_name) != std::string::npos) { - LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name); - } - return 
std::string(); - } else { - return common_token_to_piece(vocab, token, true); - } - }; - auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token"); - auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token"); - try { - return { - has_explicit_template, - std::make_unique(default_template_src, token_bos, token_eos), - template_tool_use_src.empty() - ? nullptr - : std::make_unique(template_tool_use_src, token_bos, token_eos), - }; - } catch (const std::exception & e) { - LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what()); - return { - has_explicit_template, - std::make_unique(CHATML_TEMPLATE_SRC, token_bos, token_eos), - nullptr, - }; - } + return llama_chat_apply_template(model, tmpl, msgs, true); } // // KV cache utils // -void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { +void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", @@ -1963,7 +2775,7 @@ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } -void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { +void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", @@ -2015,7 +2827,7 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si // Embedding utils // -void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) { +void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) { double sum = 0.0; switch (embd_norm) { @@ -2024,9 +2836,7 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) break; case 0: // max absolute for (int i = 0; i < n; i++) { - if (sum < std::abs(inp[i])) { - sum = std::abs(inp[i]); - } + if (sum < std::abs(inp[i])) sum = std::abs(inp[i]); } sum /= 32760.0; // make an int16 range break; @@ -2051,7 +2861,7 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) } } -float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){ +float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){ double sum = 0.0; double sum1 = 0.0; double sum2 = 0.0; @@ -2077,8 +2887,8 @@ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n // Control vector utils // -static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) { - common_control_vector_data result = { -1, {} }; +static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { + llama_control_vector_data result = { -1, {} }; ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -2087,13 +2897,13 @@ static common_control_vector_data common_control_vector_load_one(const common_co }; struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); if (!ctx_gguf) { - LOG_ERR("%s: failed to load control vector file from 
%s\n", __func__, load_info.fname.c_str()); + fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); return result; } int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); if (n_tensors == 0) { - LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); + fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); } for (int i = 0; i < n_tensors; i++) { @@ -2111,23 +2921,23 @@ static common_control_vector_data common_control_vector_load_one(const common_co } } if (layer_idx < 0) { - LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } else if (layer_idx == 0) { - LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); if (tensor->type != GGML_TYPE_F32) { - LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); + fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } if (ggml_n_dims(tensor) != 1) { - LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); + fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } @@ -2135,7 +2945,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co if (result.n_embd == -1) { result.n_embd = ggml_nelements(tensor); } else if (ggml_nelements(tensor) != result.n_embd) { - LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); + fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } @@ -2152,7 +2962,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co } if (result.n_embd == -1) { - LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); + fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); result.data.clear(); } @@ -2162,18 +2972,18 @@ static common_control_vector_data common_control_vector_load_one(const common_co return result; } -common_control_vector_data common_control_vector_load(const std::vector & load_infos) { - common_control_vector_data result = { -1, {} }; +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos) { + llama_control_vector_data result = { -1, {} }; for (const auto & info : load_infos) { - auto cur = common_control_vector_load_one(info); + auto cur = llama_control_vector_load_one(info); if (cur.n_embd == -1) { result.n_embd = -1; break; } if (result.n_embd != -1 && result.n_embd != cur.n_embd) { - LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str()); + fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str()); result.n_embd = -1; break; } @@ -2189,10 +2999,227 @@ common_control_vector_data 
common_control_vector_load(const std::vector & data) { + if (data.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + fprintf(stream, "%s: [", prop_name); + for (size_t i = 0; i < data.size() - 1; ++i) { + fprintf(stream, "%e, ", data[i]); + } + fprintf(stream, "%e]\n", data.back()); +} + +void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector & data) { + if (data.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + fprintf(stream, "%s: [", prop_name); + for (size_t i = 0; i < data.size() - 1; ++i) { + fprintf(stream, "%d, ", data[i]); + } + fprintf(stream, "%d]\n", data.back()); +} + +void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) { + std::string data_str(data == NULL ? "" : data); + + if (data_str.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + size_t pos_start = 0; + size_t pos_found = 0; + + if (std::isspace(data_str[0]) || std::isspace(data_str.back())) { + data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); + data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); + data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); + data_str = "\"" + data_str + "\""; + fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); + return; + } + + if (data_str.find('\n') == std::string::npos) { + fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); + return; + } + + fprintf(stream, "%s: |\n", prop_name); + while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) { + fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str()); + pos_start = pos_found + 1; + } +} + +void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx, + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { + const llama_sampling_params & sparams = params.sparams; + + fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); + fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); + fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); + fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false"); + fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); + fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); + fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); + fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); + fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false"); + fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); + fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? 
"true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); + fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); + +#ifdef NDEBUG + fprintf(stream, "debug: false\n"); +#else + fprintf(stream, "debug: true\n"); +#endif // NDEBUG + + fprintf(stream, "model_desc: %s\n", model_desc); + fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); + +#ifdef __OPTIMIZE__ + fprintf(stream, "optimize: true\n"); +#else + fprintf(stream, "optimize: false\n"); +#endif // __OPTIMIZE__ + + fprintf(stream, "time: %s\n", timestamp.c_str()); + + fprintf(stream, "\n"); + fprintf(stream, "###############\n"); + fprintf(stream, "# User Inputs #\n"); + fprintf(stream, "###############\n"); + fprintf(stream, "\n"); + + fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); + fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); + yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str()); + fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale); + fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); + fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); + fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); + fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); + fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); + yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str()); + fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); + fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); + fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); + + const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); + const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; + fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); + + yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); + fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); + yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); + fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); + fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? 
"true" : "false"); + fprintf(stream, "keep: %d # default: 0\n", params.n_keep); + fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); + + fprintf(stream, "logit_bias:\n"); + for (std::pair lb : sparams.logit_bias) { + if (ignore_eos && lb.first == logit_bias_eos->first) { + continue; + } + fprintf(stream, " %d: %f", lb.first, lb.second); + } + + fprintf(stream, "lora:\n"); + for (std::tuple la : params.lora_adapter) { + if (std::get<1>(la) != 1.0f) { + continue; + } + fprintf(stream, " - %s\n", std::get<0>(la).c_str()); + } + fprintf(stream, "lora_scaled:\n"); + for (std::tuple la : params.lora_adapter) { + if (std::get<1>(la) == 1.0f) { + continue; + } + fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); + } + fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); + fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); + fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); + fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); + fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); + fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); + fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); + fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); + fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); + fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); + fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); + fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); + fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); + fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); + fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); + fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); + yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str()); + fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); + fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); + fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); + yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); + + fprintf(stream, "reverse_prompt:\n"); + for (std::string ap : params.antiprompt) { + size_t pos = 0; + while ((pos = ap.find('\n', pos)) != std::string::npos) { + ap.replace(pos, 1, "\\n"); + pos += 1; + } + + fprintf(stream, " - %s\n", ap.c_str()); + } + + fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); + fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); + fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); + fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); + fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); + fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? 
"true" : "false"); + fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); + + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); + yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); + + fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); + fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); + fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); + fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); + fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); + fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); + fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); +} diff --git a/common/common.h b/common/common.h index b208d0c7e..8240ff99b 100644 --- a/common/common.h +++ b/common/common.h @@ -2,12 +2,20 @@ #pragma once -#include "llama-cpp.h" +#include "llama.h" -#include +#include "sampling.h" + +#define LOG_NO_FILE_LINE_FUNCTION +#include "log.h" + +#include #include #include -#include +#include +#include +#include +#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -25,192 +33,52 @@ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" -struct common_adapter_lora_info { - std::string path; - float scale; - - struct llama_adapter_lora * ptr; -}; - -using llama_tokens = std::vector; - // build info extern int LLAMA_BUILD_NUMBER; -extern const char * LLAMA_COMMIT; -extern const char * LLAMA_COMPILER; -extern const char * LLAMA_BUILD_TARGET; +extern char const * LLAMA_COMMIT; +extern char const * LLAMA_COMPILER; +extern char const * LLAMA_BUILD_TARGET; -struct common_control_vector_load_info; +struct llama_control_vector_load_info; // // CPU utils // -struct cpu_params { - int n_threads = -1; - bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. 
- bool mask_valid = false; // Default: any CPU - enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) - bool strict_cpu = false; // Use strict CPU placement - uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) -}; - int32_t cpu_get_num_physical_cores(); int32_t cpu_get_num_math(); // -// Common params +// CLI argument parsing // -enum llama_example { - LLAMA_EXAMPLE_COMMON, - LLAMA_EXAMPLE_SPECULATIVE, - LLAMA_EXAMPLE_MAIN, - LLAMA_EXAMPLE_INFILL, - LLAMA_EXAMPLE_EMBEDDING, - LLAMA_EXAMPLE_PERPLEXITY, - LLAMA_EXAMPLE_RETRIEVAL, - LLAMA_EXAMPLE_PASSKEY, - LLAMA_EXAMPLE_IMATRIX, - LLAMA_EXAMPLE_BENCH, - LLAMA_EXAMPLE_SERVER, - LLAMA_EXAMPLE_CVECTOR_GENERATOR, - LLAMA_EXAMPLE_EXPORT_LORA, - LLAMA_EXAMPLE_LLAVA, - LLAMA_EXAMPLE_LOOKUP, - LLAMA_EXAMPLE_PARALLEL, - LLAMA_EXAMPLE_TTS, - - LLAMA_EXAMPLE_COUNT, -}; - -enum common_sampler_type { - COMMON_SAMPLER_TYPE_NONE = 0, - COMMON_SAMPLER_TYPE_DRY = 1, - COMMON_SAMPLER_TYPE_TOP_K = 2, - COMMON_SAMPLER_TYPE_TOP_P = 3, - COMMON_SAMPLER_TYPE_MIN_P = 4, - //COMMON_SAMPLER_TYPE_TFS_Z = 5, - COMMON_SAMPLER_TYPE_TYPICAL_P = 6, - COMMON_SAMPLER_TYPE_TEMPERATURE = 7, - COMMON_SAMPLER_TYPE_XTC = 8, - COMMON_SAMPLER_TYPE_INFILL = 9, - COMMON_SAMPLER_TYPE_PENALTIES = 10, -}; - // dimensionality reduction methods, used by cvector-generator enum dimre_method { DIMRE_METHOD_PCA, DIMRE_METHOD_MEAN, }; -enum common_conversation_mode { - COMMON_CONVERSATION_MODE_DISABLED = 0, - COMMON_CONVERSATION_MODE_ENABLED = 1, - COMMON_CONVERSATION_MODE_AUTO = 2, -}; +struct gpt_params { + uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed -struct common_grammar_trigger { - std::string word; - bool at_start; -}; - -// sampling parameters -struct common_params_sampling { - uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
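// Illustrative sketch (assumption, not part of this patch): how a CPU range such as
// "0-7" might be expanded into the cpumask array of the cpu_params struct shown
// above. The helper name is hypothetical; the real parse_cpu_range()/parse_cpu_mask()
// declared later in this header live in common.cpp and may accept more formats.
// GGML_MAX_N_THREADS is assumed to come from ggml.h, as in the struct above.
#include <cstdlib>
#include <string>

static bool example_parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
    const size_t dash = range.find('-');
    if (dash == std::string::npos) {
        return false;                          // expected the form "start-end"
    }
    const int lo = std::atoi(range.substr(0, dash).c_str());
    const int hi = std::atoi(range.substr(dash + 1).c_str());
    if (lo < 0 || hi < lo || hi >= GGML_MAX_N_THREADS) {
        return false;                          // out-of-range indices are rejected
    }
    for (int i = lo; i <= hi; ++i) {
        boolmask[i] = true;                    // mark every core in the range
    }
    return true;
}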
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float xtc_probability = 0.00f; // 0.0 = disabled - float xtc_threshold = 0.10f; // > 0.5 disables XTC - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: - float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) - int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty - int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool ignore_eos = false; - bool no_perf = false; // disable performance metrics - bool timing_per_token = false; - - std::vector dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY - - - std::vector samplers = { - COMMON_SAMPLER_TYPE_PENALTIES, - COMMON_SAMPLER_TYPE_DRY, - COMMON_SAMPLER_TYPE_TOP_K, - COMMON_SAMPLER_TYPE_TYPICAL_P, - COMMON_SAMPLER_TYPE_TOP_P, - COMMON_SAMPLER_TYPE_MIN_P, - COMMON_SAMPLER_TYPE_XTC, - COMMON_SAMPLER_TYPE_TEMPERATURE, - }; - - std::string grammar; // optional BNF-like grammar to constrain sampling - bool grammar_lazy = false; - std::vector grammar_trigger_words; // optional trigger words to trigger lazy grammar - std::vector grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens. 
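// Illustrative sketch (assumption, not part of this patch): overriding a few of the
// sampling defaults listed above for a more deterministic, repetition-averse run.
// common_params_sampling and the COMMON_SAMPLER_TYPE_* values are taken from this
// header; the concrete numbers are only an example.
static common_params_sampling example_strict_sampling() {
    common_params_sampling sp;
    sp.temp           = 0.2f;    // near-greedy sampling
    sp.top_k          = 20;
    sp.top_p          = 0.90f;
    sp.penalty_last_n = 256;     // look further back when penalizing repeats
    sp.penalty_repeat = 1.10f;
    sp.samplers = {              // trimmed chain: penalties -> top-k -> top-p -> temperature
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TOP_P,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };
    return sp;
}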
- std::set preserved_tokens; - - std::vector logit_bias; // logit biases to apply - - // print the parameters into a string - std::string print() const; -}; - -struct common_params_speculative { - std::vector devices; // devices to use for offloading - - int32_t n_ctx = 0; // draft context size - int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding - int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding - int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - float p_split = 0.1f; // speculative decoding split probability - float p_min = 0.9f; // minimum speculative decoding probability (greedy) - - struct cpu_params cpuparams; - struct cpu_params cpuparams_batch; - - std::string hf_repo = ""; // HF repo // NOLINT - std::string hf_file = ""; // HF file // NOLINT - - std::string model = ""; // draft model for speculative decoding // NOLINT - std::string model_url = ""; // model url to download // NOLINT -}; - -struct common_params_vocoder { - std::string hf_repo = ""; // HF repo // NOLINT - std::string hf_file = ""; // HF file // NOLINT - - std::string model = ""; // model path // NOLINT - std::string model_url = ""; // model url to download // NOLINT - - bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT -}; - -struct common_params { + int32_t n_threads = cpu_get_num_math(); + int32_t n_threads_draft = -1; + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 4096; // context size + int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 5; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode + float p_split = 0.1f; // speculative decoding split probability + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width int32_t n_print = -1; // print token count every n tokens (-1 = disabled) @@ -221,56 +89,47 @@ struct common_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - float defrag_thold = 0.1f; // KV cache defragmentation threshold - - // offload params - std::vector devices; // devices to use for offloading - - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs - - enum llama_split_mode split_mode = 
LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs - - struct cpu_params cpuparams; - struct cpu_params cpuparams_batch; + float defrag_thold = -1.0f; // KV cache defragmentation threshold ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; + enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - struct common_params_sampling sampling; - struct common_params_speculative speculative; - struct common_params_vocoder vocoder; + // // sampling parameters + struct llama_sampling_params sparams; - std::string model = ""; // model path // NOLINT - std::string model_alias = ""; // model alias // NOLINT - std::string model_url = ""; // model url to download // NOLINT - std::string hf_token = ""; // HF token // NOLINT - std::string hf_repo = ""; // HF repo // NOLINT - std::string hf_file = ""; // HF file // NOLINT - std::string prompt = ""; // NOLINT - std::string prompt_file = ""; // store the external prompt file name // NOLINT - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT - std::string input_prefix = ""; // string to prefix user inputs with // NOLINT - std::string input_suffix = ""; // string to suffix user inputs with // NOLINT - std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT - std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT - std::string logits_file = ""; // file for saving *all* logits // NOLINT + std::string model = ""; // model path + std::string model_draft = ""; // draft model for speculative decoding + std::string model_alias = "unknown"; // model alias + std::string model_url = ""; // model url to download + std::string hf_token = ""; // HF token + std::string hf_repo = ""; // HF repo + std::string hf_file = ""; // HF file + std::string prompt = ""; + std::string prompt_file = ""; // store the external prompt file name + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state + std::string input_prefix = ""; // string to prefix user inputs with + std::string input_suffix = ""; // string to suffix user inputs with + std::string logdir = ""; // directory in which to save YAML log files + std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding + std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding + std::string logits_file = ""; // file for saving *all* logits + std::string rpc_servers = ""; // comma separated list of RPC servers std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. 
reverse prompts) std::vector kv_overrides; - bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply) - std::vector lora_adapters; // lora adapter path with user defined scale + // TODO: avoid tuple, use struct + std::vector> lora_adapter; // lora adapter path with user defined scale - std::vector control_vectors; // control vector with user defined scale + std::vector control_vectors; // control vector with user defined scale int32_t verbosity = 0; int32_t control_vector_layer_start = -1; // layer range for control vector @@ -296,6 +155,7 @@ struct common_params { bool special = false; // enable special token output bool interactive = false; // interactive mode bool interactive_first = false; // wait for user input immediately + bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it @@ -304,58 +164,51 @@ struct common_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention - bool no_perf = false; // disable performance metrics - bool ctx_shift = true; // context shift on inifinite text generation bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix + bool ignore_eos = false; // ignore generated EOS tokens bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation + bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading bool warmup = true; // warmup run bool check_tensors = false; // validate tensor data - ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K - ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V - - common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO; + std::string cache_type_k = "f16"; // KV cache data type for the K + std::string cache_type_v = "f16"; // KV cache data type for the V // multimodal models (see examples/llava) - std::string mmproj = ""; // path to multimodal projector // NOLINT + std::string mmproj = ""; // path to multimodal projector std::vector image; // path to image file(s) // embedding bool embedding = false; // get only sentence embedding - int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) + int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix - std::string embd_sep = "\n"; // separator of embeddings - bool reranking = false; // enable reranking support on server + std::string embd_sep = "\n"; // separator of embendings // server params int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read 
timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds - int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) - int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting + int32_t n_threads_http = -1; // number of threads to process HTTP requests std::string hostname = "127.0.0.1"; - std::string public_path = ""; // NOLINT - std::string chat_template = ""; // NOLINT - bool use_jinja = false; // NOLINT + std::string public_path = ""; + std::string chat_template = ""; + std::string system_prompt = ""; bool enable_chat_template = true; std::vector api_keys; - std::string ssl_file_key = ""; // NOLINT - std::string ssl_file_cert = ""; // NOLINT + std::string ssl_file_key = ""; + std::string ssl_file_cert = ""; - // "advanced" endpoints are disabled by default for better security - bool webui = true; - bool endpoint_slots = false; - bool endpoint_props = false; // only control POST requests, not GET + bool endpoint_slots = true; bool endpoint_metrics = false; bool log_json = false; @@ -403,51 +256,29 @@ struct common_params { bool spm_infill = false; // suffix/prefix/middle pattern for infill std::string lora_outfile = "ggml-lora-merged-f16.gguf"; - - // batched-bench params - bool batched_bench_output_jsonl = false; }; -// call once at the start of a program if it uses libcommon -// initializes the logging system and prints info about the build -void common_init(); +void gpt_params_handle_hf_token(gpt_params & params); +void gpt_params_handle_model_default(gpt_params & params); -std::string common_params_get_system_info(const common_params & params); +bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); +bool gpt_params_parse (int argc, char ** argv, gpt_params & params); +bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); +void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); -bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]); -bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]); -void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr); -bool set_process_priority(enum ggml_sched_priority prio); +std::string gpt_params_get_system_info(const gpt_params & params); // // String utils // -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) 
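// Illustrative sketch (assumption, not part of this patch): a typical two-pass
// vsnprintf() implementation of the string_format() helper that the attribute macro
// above annotates and that is declared just below -- the first call only measures,
// the second formats into the sized buffer. The real definition lives in common.cpp
// and may differ; the helper name here is hypothetical.
#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

static std::string example_string_format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    const int n = vsnprintf(nullptr, 0, fmt, ap);   // measure the required length
    va_end(ap);
    if (n < 0) {
        va_end(ap2);
        return "";                                  // encoding error: give back an empty string
    }
    std::vector<char> buf(n + 1);
    vsnprintf(buf.data(), buf.size(), fmt, ap2);    // format into the sized buffer
    va_end(ap2);
    return std::string(buf.data(), n);
}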
-#endif - -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) -std::string string_format(const char * fmt, ...); +std::vector string_split(std::string input, char separator); std::string string_strip(const std::string & str); std::string string_get_sortable_timestamp(); -std::string string_join(const std::vector & values, const std::string & separator); -std::vector string_split(const std::string & str, const std::string & delimiter); -std::string string_repeat(const std::string & str, size_t n); - -void string_replace_all(std::string & s, const std::string & search, const std::string & replace); - template static std::vector string_split(const std::string & str, char delim) { - static_assert(!std::is_same::value, "Please use the specialized version for std::string"); std::vector values; std::istringstream str_stream(str); std::string token; @@ -460,40 +291,9 @@ static std::vector string_split(const std::string & str, char delim) { return values; } -template<> -std::vector string_split(const std::string & input, char separator) -{ - std::vector parts; - size_t begin_pos = 0; - size_t separator_pos = input.find(separator); - while (separator_pos != std::string::npos) { - std::string part = input.substr(begin_pos, separator_pos - begin_pos); - parts.emplace_back(part); - begin_pos = separator_pos + 1; - separator_pos = input.find(separator, begin_pos); - } - parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos)); - return parts; -} - -static bool string_starts_with(const std::string & str, - const std::string & prefix) { // While we wait for C++20's std::string::starts_with... - return str.rfind(prefix, 0) == 0; -} - -static bool string_ends_with(const std::string & str, - const std::string & suffix) { // While we wait for C++20's std::string::ends_with... 
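// Annotation (not in the original): in the body below, the size() comparison guards
// against a negative start offset, and str.compare(str.size() - suffix.size(),
// suffix.size(), suffix) == 0 then checks that the trailing characters equal the
// suffix -- a pre-C++20 stand-in for std::string::ends_with().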
- return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; -} - bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); -std::string string_from(bool value); -std::string string_from(const std::vector & values); -std::string string_from(const struct llama_context * ctx, const std::vector & tokens); -std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch); - // // Filesystem utils // @@ -508,193 +308,125 @@ std::string fs_get_cache_file(const std::string & filename); // Model utils // -// note: defines object's lifetime -struct common_init_result { - llama_model_ptr model; - llama_context_ptr context; +// TODO: avoid tuplue, use struct +std::tuple llama_init_from_gpt_params(gpt_params & params); - std::vector lora; -}; +struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); -struct common_init_result common_init_from_params(common_params & params); +struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); +struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); -struct llama_model_params common_model_params_to_llama ( common_params & params); -struct llama_context_params common_context_params_to_llama(const common_params & params); -struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); - -struct llama_model * common_load_model_from_url( - const std::string & model_url, - const std::string & local_path, - const std::string & hf_token, - const struct llama_model_params & params); - -struct llama_model * common_load_model_from_hf( - const std::string & repo, - const std::string & remote_path, - const std::string & local_path, - const std::string & hf_token, - const struct llama_model_params & params); - -std::pair common_get_hf_file( - const std::string & hf_repo_with_tag, - const std::string & hf_token); - -// clear LoRA adapters from context, then apply new list of adapters -void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); - -// // Batch utils -// -void common_batch_clear(struct llama_batch & batch); +void llama_batch_clear(struct llama_batch & batch); -void common_batch_add( +void llama_batch_add( struct llama_batch & batch, llama_token id, llama_pos pos, const std::vector & seq_ids, bool logits); -// -// Token utils -// - -// longest common prefix -size_t common_lcp(const llama_tokens & a, const llama_tokens & b); - -// longet common subsequence -size_t common_lcs(const llama_tokens & a, const llama_tokens & b); - // // Vocab utils // // tokenizes a string into a vector of tokens // should work similar to Python's `tokenizer.encode` -std::vector common_tokenize( +std::vector llama_tokenize( const struct llama_context * ctx, const std::string & text, bool add_special, bool parse_special = false); -std::vector common_tokenize( - const struct llama_vocab * vocab, +std::vector llama_tokenize( + const struct llama_model * model, const std::string & text, bool add_special, bool parse_special = false); // tokenizes a token into a piece, optionally renders special/control tokens // should work similar to Python's 
`tokenizer.id_to_piece` -std::string common_token_to_piece( +std::string llama_token_to_piece( const struct llama_context * ctx, llama_token token, bool special = true); -std::string common_token_to_piece( - const struct llama_vocab * vocab, - llama_token token, - bool special = true); - // detokenizes a vector of tokens into a string // should work similar to Python's `tokenizer.decode` // optionally renders special/control tokens -std::string common_detokenize( - const struct llama_context * ctx, +std::string llama_detokenize( + llama_context * ctx, const std::vector & tokens, bool special = true); -std::string common_detokenize( - const struct llama_vocab * vocab, - const std::vector & tokens, - bool special = true); +// Uses the value from the model metadata if possible, otherwise +// defaults to true when model type is SPM, otherwise false. +bool llama_should_add_bos_token(const llama_model * model); // // Chat template utils // -struct common_tool_call { - std::string name; - std::string arguments; - std::string id; -}; - // same with llama_chat_message, but uses std::string -struct common_chat_msg { +struct llama_chat_msg { std::string role; std::string content; - std::vector tool_calls; - std::string tool_plan = ""; }; // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid -bool common_chat_verify_template(const std::string & tmpl, bool use_jinja); - -namespace minja { - class chat_template; -} - -typedef minja::chat_template common_chat_template; - -struct common_chat_templates { - bool has_explicit_template; // Model had builtin template or template overridde was specified. - std::unique_ptr template_default; // always set (defaults to chatml) - std::unique_ptr template_tool_use; -}; +bool llama_chat_verify_template(const std::string & tmpl); // CPP wrapper for llama_chat_apply_template // If the built-in template is not supported, we default to chatml // If the custom "tmpl" is not supported, we throw an error -std::string common_chat_apply_template( - const common_chat_template & tmpl, - const std::vector & chat, - bool add_ass, - bool use_jinja); +std::string llama_chat_apply_template(const struct llama_model * model, + const std::string & tmpl, + const std::vector & chat, + bool add_ass); // Format single message, while taking into account the position of that message in chat history -std::string common_chat_format_single( - const common_chat_template & tmpl, - const std::vector & past_msg, - const common_chat_msg & new_msg, - bool add_ass, - bool use_jinja); +std::string llama_chat_format_single(const struct llama_model * model, + const std::string & tmpl, + const std::vector & past_msg, + const llama_chat_msg & new_msg, + bool add_ass); // Returns an example of formatted chat -std::string common_chat_format_example( - const common_chat_template & tmpl, bool use_jinja); - -common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override); +std::string llama_chat_format_example(const struct llama_model * model, + const std::string & tmpl); // // KV cache utils // // Dump the KV cache view with the number of sequences per cell. -void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); +void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). 
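// Illustrative usage sketch (assumption, not part of this patch): dumping KV cache
// occupancy during decoding with the dump helpers declared here. It assumes a fully
// initialized llama_context and that llama_kv_cache_view_init/_update/_free are
// available from llama.h.
static void example_dump_kv_cache(llama_context * ctx, int n_seq_max) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, n_seq_max);
    llama_kv_cache_view_update(ctx, &view);
    llama_kv_cache_dump_view(view, 80);       // compact view: one character per cell
    llama_kv_cache_dump_view_seqs(view, 40);  // verbose view: sequence ids per cell
    llama_kv_cache_view_free(&view);
}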
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); +void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); // // Embedding utils // -// TODO: repace embd_norm with an enum -void common_embd_normalize(const float * inp, float * out, int n, int embd_norm); +void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); -float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); +float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); // // Control vector utils // -struct common_control_vector_data { +struct llama_control_vector_data { int n_embd; // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd std::vector data; }; -struct common_control_vector_load_info { +struct llama_control_vector_load_info { float strength; std::string fname; @@ -702,16 +434,24 @@ struct common_control_vector_load_info { // Load control vectors, scale each by strength, and add them together. // On error, returns {-1, empty} -common_control_vector_data common_control_vector_load(const std::vector & load_infos); +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); // // Split utils // -namespace { +static const char * const LLM_KV_SPLIT_NO = "split.no"; +static const char * const LLM_KV_SPLIT_COUNT = "split.count"; +static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; -const char * const LLM_KV_SPLIT_NO = "split.no"; -const char * const LLM_KV_SPLIT_COUNT = "split.count"; -const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; +// +// YAML utils +// -} +void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector & data); +void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector & data); +void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); + +void yaml_dump_non_result_info( + FILE * stream, const gpt_params & params, const llama_context * lctx, + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); diff --git a/common/console.cpp b/common/console.cpp index 078a8d678..f65cbc6ed 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -94,9 +94,6 @@ namespace console { simple_io = true; } } - if (simple_io) { - _setmode(_fileno(stdin), _O_U8TEXT); - } #else // POSIX-specific console initialization if (!simple_io) { diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp new file mode 100644 index 000000000..a518b766d --- /dev/null +++ b/common/grammar-parser.cpp @@ -0,0 +1,536 @@ +#include "grammar-parser.h" +#include +#include +#include +#include +#include +#include + +namespace grammar_parser { + // NOTE: assumes valid utf8 (but checks for overrun) + // copied from llama.cpp + static std::pair decode_utf8(const char * src) { + static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t first_byte = static_cast(*src); + uint8_t highbits = first_byte >> 4; + int len = lookup[highbits]; + uint8_t mask = (1 << (8 - len)) - 1; + uint32_t value = first_byte & mask; + const char * end = src + len; // may overrun! 
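// Annotation (not part of the patch): worked example of the decode that follows.
// For the 2-byte sequence 0xC3 0xA9 ("é"): highbits = 0xC, so the lookup gives
// len = 2, mask = (1 << 6) - 1 = 0x3F, and value starts as 0xC3 & 0x3F = 0x03; the
// loop then folds in 0xA9 & 0x3F = 0x29, giving (0x03 << 6) + 0x29 = 0xE9 = U+00E9.
// The "may overrun" end pointer is safe because the loop also stops at the
// terminating NUL via the *pos check.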
+ const char * pos = src + 1; + for ( ; pos < end && *pos; pos++) { + value = (value << 6) + (static_cast(*pos) & 0x3F); + } + return std::make_pair(value, pos); + } + + static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { + uint32_t next_id = static_cast(state.symbol_ids.size()); + auto result = state.symbol_ids.emplace(std::string(src, len), next_id); + return result.first->second; + } + + static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { + uint32_t next_id = static_cast(state.symbol_ids.size()); + state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; + return next_id; + } + + static void add_rule( + parse_state & state, + uint32_t rule_id, + const std::vector & rule) { + if (state.rules.size() <= rule_id) { + state.rules.resize(rule_id + 1); + } + state.rules[rule_id] = rule; + } + + static bool is_digit_char(char c) { + return '0' <= c && c <= '9'; + } + + static bool is_word_char(char c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c); + } + + static std::pair parse_hex(const char * src, int size) { + const char * pos = src; + const char * end = src + size; + uint32_t value = 0; + for ( ; pos < end && *pos; pos++) { + value <<= 4; + char c = *pos; + if ('a' <= c && c <= 'f') { + value += c - 'a' + 10; + } else if ('A' <= c && c <= 'F') { + value += c - 'A' + 10; + } else if ('0' <= c && c <= '9') { + value += c - '0'; + } else { + break; + } + } + if (pos != end) { + throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src); + } + return std::make_pair(value, pos); + } + + static const char * parse_space(const char * src, bool newline_ok) { + const char * pos = src; + while (*pos == ' ' || *pos == '\t' || *pos == '#' || + (newline_ok && (*pos == '\r' || *pos == '\n'))) { + if (*pos == '#') { + while (*pos && *pos != '\r' && *pos != '\n') { + pos++; + } + } else { + pos++; + } + } + return pos; + } + + static const char * parse_name(const char * src) { + const char * pos = src; + while (is_word_char(*pos)) { + pos++; + } + if (pos == src) { + throw std::runtime_error(std::string("expecting name at ") + src); + } + return pos; + } + + static const char * parse_int(const char * src) { + const char * pos = src; + while (is_digit_char(*pos)) { + pos++; + } + if (pos == src) { + throw std::runtime_error(std::string("expecting integer at ") + src); + } + return pos; + } + + static std::pair parse_char(const char * src) { + if (*src == '\\') { + switch (src[1]) { + case 'x': return parse_hex(src + 2, 2); + case 'u': return parse_hex(src + 2, 4); + case 'U': return parse_hex(src + 2, 8); + case 't': return std::make_pair('\t', src + 2); + case 'r': return std::make_pair('\r', src + 2); + case 'n': return std::make_pair('\n', src + 2); + case '\\': + case '"': + case '[': + case ']': + return std::make_pair(src[1], src + 2); + default: + throw std::runtime_error(std::string("unknown escape at ") + src); + } + } else if (*src) { + return decode_utf8(src); + } + throw std::runtime_error("unexpected end of input"); + } + + const char * parse_alternates( + parse_state & state, + const char * src, + const std::string & rule_name, + uint32_t rule_id, + bool is_nested); + + static const char * parse_sequence( + parse_state & state, + const char * src, + const std::string & rule_name, + std::vector & out_elements, + bool is_nested) { + size_t last_sym_start = out_elements.size(); + const char * pos = src; + + auto handle_repetitions = [&](int 
min_times, int max_times) { + + if (last_sym_start == out_elements.size()) { + throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos); + } + + // apply transformation to previous symbol (last_sym_start to end) according to + // the following rewrite rules: + // S{m,n} --> S S S (m times) S'(n-m) + // S'(x) ::= S S'(x-1) | + // (... n-m definitions of these S' rules ...) + // S'(1) ::= S | + // S{m,} --> S S S (m times) S' + // S' ::= S S' | + // S* --> S{0,} + // --> S' ::= S S' | + // S+ --> S{1,} + // --> S S' + // S' ::= S S' | + // S? --> S{0,1} + // --> S' + // S' ::= S | + + std::vector previous_elements(out_elements.begin() + last_sym_start, out_elements.end()); + if (min_times == 0) { + out_elements.resize(last_sym_start); + } else { + // Repeat the previous elements (min_times - 1) times + for (int i = 1; i < min_times; i++) { + out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end()); + } + } + + uint32_t last_rec_rule_id = 0; + auto n_opt = max_times < 0 ? 1 : max_times - min_times; + + std::vector rec_rule(previous_elements); + for (int i = 0; i < n_opt; i++) { + rec_rule.resize(previous_elements.size()); + uint32_t rec_rule_id = generate_symbol_id(state, rule_name); + if (i > 0 || max_times < 0) { + rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id}); + } + rec_rule.push_back({LLAMA_GRETYPE_ALT, 0}); + rec_rule.push_back({LLAMA_GRETYPE_END, 0}); + add_rule(state, rec_rule_id, rec_rule); + last_rec_rule_id = rec_rule_id; + } + if (n_opt > 0) { + out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); + } + }; + + while (*pos) { + if (*pos == '"') { // literal string + pos++; + last_sym_start = out_elements.size(); + while (*pos != '"') { + if (!*pos) { + throw std::runtime_error("unexpected end of input"); + } + auto char_pair = parse_char(pos); + pos = char_pair.second; + out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); + } + pos = parse_space(pos + 1, is_nested); + } else if (*pos == '[') { // char range(s) + pos++; + enum llama_gretype start_type = LLAMA_GRETYPE_CHAR; + if (*pos == '^') { + pos++; + start_type = LLAMA_GRETYPE_CHAR_NOT; + } + last_sym_start = out_elements.size(); + while (*pos != ']') { + if (!*pos) { + throw std::runtime_error("unexpected end of input"); + } + auto char_pair = parse_char(pos); + pos = char_pair.second; + enum llama_gretype type = last_sym_start < out_elements.size() + ? 
LLAMA_GRETYPE_CHAR_ALT + : start_type; + + out_elements.push_back({type, char_pair.first}); + if (pos[0] == '-' && pos[1] != ']') { + if (!pos[1]) { + throw std::runtime_error("unexpected end of input"); + } + auto endchar_pair = parse_char(pos + 1); + pos = endchar_pair.second; + out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); + } + } + pos = parse_space(pos + 1, is_nested); + } else if (is_word_char(*pos)) { // rule reference + const char * name_end = parse_name(pos); + uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos); + pos = parse_space(name_end, is_nested); + last_sym_start = out_elements.size(); + out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); + } else if (*pos == '(') { // grouping + // parse nested alternates into synthesized rule + pos = parse_space(pos + 1, true); + uint32_t sub_rule_id = generate_symbol_id(state, rule_name); + pos = parse_alternates(state, pos, rule_name, sub_rule_id, true); + last_sym_start = out_elements.size(); + // output reference to synthesized rule + out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); + if (*pos != ')') { + throw std::runtime_error(std::string("expecting ')' at ") + pos); + } + pos = parse_space(pos + 1, is_nested); + } else if (*pos == '.') { // any char + last_sym_start = out_elements.size(); + out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); + pos = parse_space(pos + 1, is_nested); + } else if (*pos == '*') { + pos = parse_space(pos + 1, is_nested); + handle_repetitions(0, -1); + } else if (*pos == '+') { + pos = parse_space(pos + 1, is_nested); + handle_repetitions(1, -1); + } else if (*pos == '?') { + pos = parse_space(pos + 1, is_nested); + handle_repetitions(0, 1); + } else if (*pos == '{') { + pos = parse_space(pos + 1, is_nested); + + if (!is_digit_char(*pos)) { + throw std::runtime_error(std::string("expecting an int at ") + pos); + } + const char * int_end = parse_int(pos); + int min_times = std::stoul(std::string(pos, int_end - pos)); + pos = parse_space(int_end, is_nested); + + int max_times = -1; + + if (*pos == '}') { + max_times = min_times; + pos = parse_space(pos + 1, is_nested); + } else if (*pos == ',') { + pos = parse_space(pos + 1, is_nested); + + if (is_digit_char(*pos)) { + const char * int_end = parse_int(pos); + max_times = std::stoul(std::string(pos, int_end - pos)); + pos = parse_space(int_end, is_nested); + } + + if (*pos != '}') { + throw std::runtime_error(std::string("expecting '}' at ") + pos); + } + pos = parse_space(pos + 1, is_nested); + } else { + throw std::runtime_error(std::string("expecting ',' at ") + pos); + } + handle_repetitions(min_times, max_times); + } else { + break; + } + } + return pos; + } + + const char * parse_alternates( + parse_state & state, + const char * src, + const std::string & rule_name, + uint32_t rule_id, + bool is_nested) { + std::vector rule; + const char * pos = parse_sequence(state, src, rule_name, rule, is_nested); + while (*pos == '|') { + rule.push_back({LLAMA_GRETYPE_ALT, 0}); + pos = parse_space(pos + 1, true); + pos = parse_sequence(state, pos, rule_name, rule, is_nested); + } + rule.push_back({LLAMA_GRETYPE_END, 0}); + add_rule(state, rule_id, rule); + return pos; + } + + static const char * parse_rule(parse_state & state, const char * src) { + const char * name_end = parse_name(src); + const char * pos = parse_space(name_end, false); + size_t name_len = name_end - src; + uint32_t rule_id = get_symbol_id(state, src, name_len); + const std::string name(src, name_len); + + if (!(pos[0] == ':' 
&& pos[1] == ':' && pos[2] == '=')) { + throw std::runtime_error(std::string("expecting ::= at ") + pos); + } + pos = parse_space(pos + 3, true); + + pos = parse_alternates(state, pos, name, rule_id, false); + + if (*pos == '\r') { + pos += pos[1] == '\n' ? 2 : 1; + } else if (*pos == '\n') { + pos++; + } else if (*pos) { + throw std::runtime_error(std::string("expecting newline or end at ") + pos); + } + return parse_space(pos, true); + } + + parse_state parse(const char * src) { + try { + parse_state state; + const char * pos = parse_space(src, true); + while (*pos) { + pos = parse_rule(state, pos); + } + // Validate the state to ensure that all rules are defined + for (const auto & rule : state.rules) { + for (const auto & elem : rule) { + if (elem.type == LLAMA_GRETYPE_RULE_REF) { + // Ensure that the rule at that location exists + if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) { + // Get the name of the rule that is missing + for (const auto & kv : state.symbol_ids) { + if (kv.second == elem.value) { + throw std::runtime_error("Undefined rule identifier '" + kv.first + "'"); + } + } + } + } + } + } + return state; + } catch (const std::exception & err) { + fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what()); + return parse_state(); + } + } + + static void print_grammar_char(FILE * file, uint32_t c) { + if (0x20 <= c && c <= 0x7f) { + fprintf(file, "%c", static_cast(c)); + } else { + // cop out of encoding UTF-8 + fprintf(file, "", c); + } + } + + static bool is_char_element(llama_grammar_element elem) { + switch (elem.type) { + case LLAMA_GRETYPE_CHAR: return true; + case LLAMA_GRETYPE_CHAR_NOT: return true; + case LLAMA_GRETYPE_CHAR_ALT: return true; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true; + case LLAMA_GRETYPE_CHAR_ANY: return true; + default: return false; + } + } + + static void print_rule_binary(FILE * file, const std::vector & rule) { + for (auto elem : rule) { + switch (elem.type) { + case LLAMA_GRETYPE_END: fprintf(file, "END"); break; + case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break; + case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break; + case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break; + case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break; + case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break; + case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break; + } + switch (elem.type) { + case LLAMA_GRETYPE_END: + case LLAMA_GRETYPE_ALT: + case LLAMA_GRETYPE_RULE_REF: + fprintf(file, "(%u) ", elem.value); + break; + case LLAMA_GRETYPE_CHAR: + case LLAMA_GRETYPE_CHAR_NOT: + case LLAMA_GRETYPE_CHAR_RNG_UPPER: + case LLAMA_GRETYPE_CHAR_ALT: + case LLAMA_GRETYPE_CHAR_ANY: + fprintf(file, "(\""); + print_grammar_char(file, elem.value); + fprintf(file, "\") "); + break; + } + } + fprintf(file, "\n"); + } + + static void print_rule( + FILE * file, + uint32_t rule_id, + const std::vector & rule, + const std::map & symbol_id_names) { + if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) { + throw std::runtime_error( + "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id)); + } + fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str()); + for (size_t i = 0, end = rule.size() - 1; i < end; i++) { + llama_grammar_element elem = rule[i]; + switch (elem.type) { + case LLAMA_GRETYPE_END: + throw std::runtime_error( + "unexpected end of rule: " + std::to_string(rule_id) + "," + + 
std::to_string(i)); + case LLAMA_GRETYPE_ALT: + fprintf(file, "| "); + break; + case LLAMA_GRETYPE_RULE_REF: + fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str()); + break; + case LLAMA_GRETYPE_CHAR: + fprintf(file, "["); + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_NOT: + fprintf(file, "[^"); + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: + if (i == 0 || !is_char_element(rule[i - 1])) { + throw std::runtime_error( + "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " + + std::to_string(rule_id) + "," + std::to_string(i)); + } + fprintf(file, "-"); + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_ALT: + if (i == 0 || !is_char_element(rule[i - 1])) { + throw std::runtime_error( + "LLAMA_GRETYPE_CHAR_ALT without preceding char: " + + std::to_string(rule_id) + "," + std::to_string(i)); + } + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_ANY: + fprintf(file, "."); + break; + } + if (is_char_element(elem)) { + switch (rule[i + 1].type) { + case LLAMA_GRETYPE_CHAR_ALT: + case LLAMA_GRETYPE_CHAR_RNG_UPPER: + case LLAMA_GRETYPE_CHAR_ANY: + break; + default: + fprintf(file, "] "); + } + } + } + fprintf(file, "\n"); + } + + void print_grammar(FILE * file, const parse_state & state) { + try { + std::map symbol_id_names; + for (const auto & kv : state.symbol_ids) { + symbol_id_names[kv.second] = kv.first; + } + for (size_t i = 0, end = state.rules.size(); i < end; i++) { + // fprintf(file, "%zu: ", i); + // print_rule_binary(file, state.rules[i]); + print_rule(file, uint32_t(i), state.rules[i], symbol_id_names); + // fprintf(file, "\n"); + } + } catch (const std::exception & err) { + fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what()); + } + } + + std::vector parse_state::c_rules() { + std::vector ret; + ret.reserve(rules.size()); + for (const auto & rule : rules) { + ret.push_back(rule.data()); + } + return ret; + } +} diff --git a/common/grammar-parser.h b/common/grammar-parser.h new file mode 100644 index 000000000..9037d7272 --- /dev/null +++ b/common/grammar-parser.h @@ -0,0 +1,29 @@ +// Implements a parser for an extended Backus-Naur form (BNF), producing the +// binary context-free grammar format specified by llama.h. Supports character +// ranges, grouping, and repetition operators. 
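A minimal usage sketch of the parser above (an editorial illustration, not part of the patch): it assumes the GBNF text is already in memory and that this vintage of llama.h exposes llama_grammar_init(rules, n_rules, start_rule_index); the helper name and that exact call are assumptions rather than something taken from this diff.

#include "grammar-parser.h"

#include <cstdio>
#include <vector>

// Parse a GBNF grammar string and hand the flattened rules to llama.cpp.
static llama_grammar * grammar_from_gbnf(const char * gbnf) {
    grammar_parser::parse_state parsed = grammar_parser::parse(gbnf);
    if (parsed.rules.empty()) {
        return nullptr;  // parse() reports its own errors and returns an empty state
    }
    grammar_parser::print_grammar(stderr, parsed);  // human-readable dump of the binary rules

    std::vector<const llama_grammar_element *> rules = parsed.c_rules();
    return llama_grammar_init(rules.data(), rules.size(), parsed.symbol_ids.at("root"));
}

With the repetition rewrite implemented in handle_repetitions(), a rule such as root ::= "a"{2,4} is stored as something equivalent to

    root   ::= "a" "a" root_2
    root_1 ::= "a" |
    root_2 ::= "a" root_1 |

where the helper rule names and ids depend on the current symbol table, so they are only illustrative.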
As an example, a grammar for +// arithmetic might look like: +// +// root ::= expr +// expr ::= term ([-+*/] term)* +// term ::= num | "(" space expr ")" space +// num ::= [0-9]+ space +// space ::= [ \t\n]* + +#pragma once +#include "llama.h" +#include +#include +#include +#include + +namespace grammar_parser { + struct parse_state { + std::map symbol_ids; + std::vector> rules; + + std::vector c_rules(); + }; + + parse_state parse(const char * src); + void print_grammar(FILE * file, const parse_state & state); +} diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 3ebcc3d9f..881eb49e3 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -1,6 +1,4 @@ #include "json-schema-to-grammar.h" -#include "common.h" - #include #include #include @@ -13,6 +11,11 @@ using json = nlohmann::ordered_json; +template +static std::string join(Iterator begin, Iterator end, const std::string & separator); + +static std::string repeat(const std::string & str, size_t n); + static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") { auto has_max = max_items != std::numeric_limits::max(); @@ -125,8 +128,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream & if (sub_len > 0) { auto from_sub = from.substr(i + 1); auto to_sub = to.substr(i + 1); - auto sub_zeros = string_repeat("0", sub_len); - auto sub_nines = string_repeat("9", sub_len); + auto sub_zeros = repeat("0", sub_len); + auto sub_nines = repeat("9", sub_len); auto to_reached = false; out << "("; @@ -185,8 +188,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream & auto max_digits = max_s.length(); for (auto digits = min_digits; digits < max_digits; digits++) { - uniform_range(min_s, string_repeat("9", digits)); - min_s = "1" + string_repeat("0", digits); + uniform_range(min_s, repeat("9", digits)); + min_s = "1" + repeat("0", digits); out << " | "; } uniform_range(min_s, max_s); @@ -315,6 +318,49 @@ std::unordered_map GRAMMAR_LITERAL_ESCAPES = { std::unordered_set NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'}; std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; +template +std::string join(Iterator begin, Iterator end, const std::string & separator) { + std::ostringstream result; + if (begin != end) { + result << *begin; + for (Iterator it = begin + 1; it != end; ++it) { + result << separator << *it; + } + } + return result.str(); +} + +static std::vector split(const std::string & str, const std::string & delimiter) { + std::vector tokens; + size_t start = 0; + size_t end = str.find(delimiter); + + while (end != std::string::npos) { + tokens.push_back(str.substr(start, end - start)); + start = end + delimiter.length(); + end = str.find(delimiter, start); + } + + tokens.push_back(str.substr(start)); + + return tokens; +} + +static std::string repeat(const std::string & str, size_t n) { + if (n == 0) { + return ""; + } + + std::string result; + result.reserve(str.length() * n); + + for (size_t i = 0; i < n; ++i) { + result += str; + } + + return result; +} + static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function & replacement) { std::smatch match; std::string result; @@ -343,7 +389,6 @@ static std::string format_literal(const std::string & literal) { class SchemaConverter { private: - friend 
std::string build_grammar(const std::function & cb, const common_grammar_options & options); std::function _fetch_json; bool _dotall; std::map _rules; @@ -373,7 +418,7 @@ private: for (size_t i = 0; i < alt_schemas.size(); i++) { rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i))); } - return string_join(rules, " | "); + return join(rules.begin(), rules.end(), " | "); } std::string _visit_pattern(const std::string & pattern, const std::string & name) { @@ -436,7 +481,7 @@ private: for (const auto & item : ret) { results.push_back(to_rule(item)); } - return std::make_pair(string_join(results, " "), false); + return std::make_pair(join(results.begin(), results.end(), " "), false); }; while (i < length) { @@ -494,7 +539,7 @@ private: } curly_brackets += '}'; i++; - auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ","); + auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ","); int min_times = 0; int max_times = std::numeric_limits::max(); try { @@ -566,7 +611,7 @@ private: } return join_seq(); }; - return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space"); + return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); } /* @@ -764,11 +809,10 @@ private: public: SchemaConverter( const std::function & fetch_json, - bool dotall, - bool compact_spaces) + bool dotall) : _fetch_json(fetch_json), _dotall(dotall) { - _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE; + _rules["space"] = SPACE_RULE; } void resolve_refs(json & schema, const std::string & url) { @@ -810,7 +854,7 @@ public: return; } std::string pointer = ref.substr(ref.find('#') + 1); - std::vector tokens = string_split(pointer, "/"); + std::vector tokens = split(pointer, "/"); for (size_t i = 1; i < tokens.size(); ++i) { std::string sel = tokens[i]; if (target.is_null() || !target.contains(sel)) { @@ -861,7 +905,7 @@ public: for (const auto & v : schema["enum"]) { enum_values.push_back(_generate_constant_rule(v)); } - return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space"); + return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space"); } else if ((schema_type.is_null() || schema_type == "object") && (schema.contains("properties") || (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) { @@ -975,10 +1019,10 @@ public: void check_errors() { if (!_errors.empty()) { - throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n")); + throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n")); } if (!_warnings.empty()) { - fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str()); + fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str()); } } @@ -991,35 +1035,11 @@ public: } }; -std::string json_schema_to_grammar(const json & schema, bool force_gbnf) { -#ifdef LLAMA_USE_LLGUIDANCE - if (!force_gbnf) { - return "%llguidance {}\nstart: %json " + schema.dump(); - } -#else - (void)force_gbnf; -#endif // LLAMA_USE_LLGUIDANCE - return build_grammar([&](const common_grammar_builder & callbacks) { - auto copy = schema; - callbacks.resolve_refs(copy); - callbacks.add_schema("", copy); - }); -} - -std::string build_grammar(const std::function & cb, const common_grammar_options & options) { - SchemaConverter 
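For context on the simplified entry point being reintroduced in this hunk, a hedged usage sketch follows; the schema literal is made up for illustration, and the build is assumed to already provide nlohmann::json and the GGML headers pulled in by json-schema-to-grammar.h.

#include "json-schema-to-grammar.h"

#include <cstdio>
#include <string>

int main() {
    // A toy schema: an object with a required string field and an optional integer.
    nlohmann::ordered_json schema = {
        {"type", "object"},
        {"properties", {
            {"name", {{"type", "string"}}},
            {"age",  {{"type", "integer"}}}
        }},
        {"required", {"name"}}
    };

    // Returns a GBNF grammar string that constrains generation to JSON
    // documents matching the schema.
    std::string gbnf = json_schema_to_grammar(schema);
    printf("%s\n", gbnf.c_str());
    return 0;
}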
converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces); - common_grammar_builder builder { - /* .add_rule = */ [&](const std::string & name, const std::string & rule) { - return converter._add_rule(name, rule); - }, - /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) { - return converter.visit(schema, name == "root" ? "" : name); - }, - /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) { - converter.resolve_refs(schema, ""); - } - }; - cb(builder); +std::string json_schema_to_grammar(const json & schema) { + SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false); + auto copy = schema; + converter.resolve_refs(copy, "input"); + converter.visit(copy, ""); converter.check_errors(); return converter.format_grammar(); } diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h index 62a3b0a44..41623b346 100644 --- a/common/json-schema-to-grammar.h +++ b/common/json-schema-to-grammar.h @@ -5,18 +5,4 @@ #define JSON_ASSERT GGML_ASSERT #include "json.hpp" -std::string json_schema_to_grammar(const nlohmann::ordered_json & schema, - bool force_gbnf = false); - -struct common_grammar_builder { - std::function add_rule; - std::function add_schema; - std::function resolve_refs; -}; - -struct common_grammar_options { - bool dotall = false; - bool compact_spaces = false; -}; - -std::string build_grammar(const std::function & cb, const common_grammar_options & options = {}); +std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); diff --git a/common/llguidance.cpp b/common/llguidance.cpp deleted file mode 100644 index 2feeb93c8..000000000 --- a/common/llguidance.cpp +++ /dev/null @@ -1,270 +0,0 @@ -#include "sampling.h" -#include "log.h" - -#ifdef LLAMA_USE_LLGUIDANCE - -# include "llguidance.h" -# include - -struct llama_sampler_llg { - const llama_vocab * vocab; - std::string grammar_kind; - std::string grammar_data; - LlgTokenizer * tokenizer; - LlgConstraint * grammar; - LlgMaskResult llg_res; - bool has_llg_res; -}; - -static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind, - const char * grammar_data) { - LlgConstraintInit cinit; - llg_constraint_init_set_defaults(&cinit, tokenizer); - const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL"); - if (log_level && *log_level) { - cinit.log_stderr_level = atoi(log_level); - } - auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data); - if (llg_get_error(c)) { - LOG_ERR("llg error: %s\n", llg_get_error(c)); - llg_free_constraint(c); - return nullptr; - } - return c; -} - -static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) { - return "llguidance"; -} - -static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) { - auto * ctx = (llama_sampler_llg *) smpl->ctx; - if (ctx->grammar) { - LlgCommitResult res; - llg_commit_token(ctx->grammar, token, &res); - ctx->has_llg_res = false; - } -} - -static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_llg *) smpl->ctx; - if (ctx->grammar) { - if (!ctx->has_llg_res) { - if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) { - ctx->has_llg_res = true; - } else { - LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar)); - llg_free_constraint(ctx->grammar); - ctx->grammar = nullptr; - } - } - if (ctx->has_llg_res) { - if (ctx->llg_res.is_stop) { - for (size_t i = 0; i < 
cur_p->size; ++i) { - if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) { - cur_p->data[i].logit = -INFINITY; - } - } - } else { - const uint32_t * mask = ctx->llg_res.sample_mask; - for (size_t i = 0; i < cur_p->size; ++i) { - auto token = cur_p->data[i].id; - if ((mask[token / 32] & (1 << (token % 32))) == 0) { - cur_p->data[i].logit = -INFINITY; - } - } - } - } - } -} - -static void llama_sampler_llg_reset(llama_sampler * smpl) { - auto * ctx = (llama_sampler_llg *) smpl->ctx; - if (!ctx->grammar) { - return; - } - - auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str()); - llg_free_constraint(ctx->grammar); - ctx->grammar = grammar_new; - ctx->has_llg_res = false; -} - -static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_llg *) smpl->ctx; - - auto * result = llama_sampler_init_llg(ctx->vocab, nullptr, nullptr); - - // copy the state - { - auto * result_ctx = (llama_sampler_llg *) result->ctx; - - if (ctx->grammar) { - result_ctx->grammar_kind = ctx->grammar_kind; - result_ctx->grammar_data = ctx->grammar_data; - result_ctx->grammar = llg_clone_constraint(ctx->grammar); - result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer); - } - } - - return result; -} - -static void llama_sampler_llg_free(llama_sampler * smpl) { - const auto * ctx = (llama_sampler_llg *) smpl->ctx; - - if (ctx->grammar) { - llg_free_constraint(ctx->grammar); - llg_free_tokenizer(ctx->tokenizer); - } - - delete ctx; -} - -static llama_sampler_i llama_sampler_llg_i = { - /* .name = */ llama_sampler_llg_name, - /* .accept = */ llama_sampler_llg_accept_impl, - /* .apply = */ llama_sampler_llg_apply, - /* .reset = */ llama_sampler_llg_reset, - /* .clone = */ llama_sampler_llg_clone, - /* .free = */ llama_sampler_llg_free, -}; - -static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len, - uint32_t * output_tokens, size_t output_tokens_len) { - const llama_vocab * vocab = (const llama_vocab *) user_data; - int r = 0; - try { - r = llama_tokenize(vocab, (const char *) bytes, bytes_len, (int32_t *) output_tokens, output_tokens_len, false, - true); - } catch (const std::exception & e) { - GGML_ABORT("llama_tokenize failed: %s\n", e.what()); - } - if (r < 0) { - return -r; - } - return r; -} - -static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab) { - // TODO store the tokenizer in the vocab somehow - static const llama_vocab * vocab_cache; - static LlgTokenizer * tokenizer_cache; - - if (vocab_cache == vocab) { - return llg_clone_tokenizer(tokenizer_cache); - } - - auto tok_eos = llama_vocab_eot(vocab); - if (tok_eos == LLAMA_TOKEN_NULL) { - tok_eos = llama_vocab_eos(vocab); - } - - size_t vocab_size = llama_vocab_n_tokens(vocab); - - auto token_lens = new uint32_t[vocab_size]; - // we typically have ~7 bytes per token; let's go on the safe side here - auto token_bytes_size = vocab_size * 16 + 1024 * 1024; - auto token_bytes = new uint8_t[token_bytes_size]; - - size_t offset = 0; - for (size_t i = 0; i < vocab_size; i++) { - size_t max_token = 1024; - if (token_bytes_size - offset < max_token) { - GGML_ABORT("token_bytes buffer too small\n"); - } - - llama_token token = i; - auto dp = (char *) token_bytes + offset; - auto size = llama_detokenize(vocab, &token, 1, dp, max_token, false, false); - if (size < 0) { - GGML_ABORT("llama_detokenize failed\n"); - } - if (size == 0) { - size = llama_detokenize(vocab, &token, 
1, dp + 1, max_token - 1, false, true); - if (size < 0) { - GGML_ABORT("llama_detokenize failed\n"); - } - if (size != 0) { - *dp = '\xff'; // special token prefix marker - size += 1; - } - } - - token_lens[i] = size; - offset += size; - } - - LlgTokenizerInit tinit = { - /* .vocab_size = */ (uint32_t) vocab_size, - /* .tok_eos = */ (uint32_t) tok_eos, - /* .token_lens = */ token_lens, - /* .token_bytes = */ token_bytes, - /* .tokenizer_json = */ nullptr, - /* .tokenize_assumes_string = */ true, - /* .tokenize_fn = */ llama_sampler_llg_tokenize_fn, - /* .use_approximate_greedy_tokenize_fn = */ false, - /* .tokenize_user_data = */ vocab, - }; - - char error_buffer[1024]; - LlgTokenizer * tokenizer = llg_new_tokenizer(&tinit, error_buffer, sizeof(error_buffer)); - - delete[] token_bytes; - delete[] token_lens; - - if (tokenizer == nullptr) { - LOG_ERR("llg tokenizer error: %s\n", error_buffer); - return tokenizer; - } - - if (tokenizer_cache) { - llg_free_tokenizer(tokenizer_cache); - } - vocab_cache = vocab; - tokenizer_cache = tokenizer; - - return llg_clone_tokenizer(tokenizer_cache); -} - -llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind, - const char * grammar_data) { - auto * ctx = new llama_sampler_llg; - - if (grammar_kind != nullptr && grammar_kind[0] != '\0') { - auto tokenizer = llama_sampler_llg_new_tokenizer(vocab); - *ctx = { - /* .vocab = */ vocab, - /* .grammar_kind = */ grammar_kind, - /* .grammar_data = */ grammar_data, - /* .tokenizer = */ tokenizer, - /* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data), - /* .llg_res = */ {}, - /* .has_llg_res = */ false, - }; - } else { - *ctx = { - /* .vocab = */ vocab, - /* .grammar_kind = */ {}, - /* .grammar_data = */ {}, - /* .tokenizer = */ nullptr, - /* .grammar = */ nullptr, - /* .llg_res = */ {}, - /* .has_llg_res = */ false, - }; - } - - return llama_sampler_init( - /* .iface = */ &llama_sampler_llg_i, - /* .ctx = */ ctx - ); -} - -#else - -llama_sampler * llama_sampler_init_llg(const llama_vocab *, const char *, const char *) { - LOG_WRN("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); - return nullptr; -} - -#endif // LLAMA_USE_LLGUIDANCE diff --git a/common/log.cpp b/common/log.cpp deleted file mode 100644 index 4bfbecf15..000000000 --- a/common/log.cpp +++ /dev/null @@ -1,392 +0,0 @@ -#include "log.h" - -#include -#include -#include -#include -#include -#include -#include - -int common_log_verbosity_thold = LOG_DEFAULT_LLAMA; - -void common_log_set_verbosity_thold(int verbosity) { - common_log_verbosity_thold = verbosity; -} - -static int64_t t_us() { - return std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); -} - -// colors -enum common_log_col : int { - COMMON_LOG_COL_DEFAULT = 0, - COMMON_LOG_COL_BOLD, - COMMON_LOG_COL_RED, - COMMON_LOG_COL_GREEN, - COMMON_LOG_COL_YELLOW, - COMMON_LOG_COL_BLUE, - COMMON_LOG_COL_MAGENTA, - COMMON_LOG_COL_CYAN, - COMMON_LOG_COL_WHITE, -}; - -// disable colors by default -static std::vector g_col = { - "", - "", - "", - "", - "", - "", - "", - "", - "", -}; - -struct common_log_entry { - enum ggml_log_level level; - - bool prefix; - - int64_t timestamp; - - std::vector msg; - - // signals the worker thread to stop - bool is_end; - - void print(FILE * file = nullptr) const { - FILE * fcur = file; - if (!fcur) { - // stderr displays DBG messages only when their verbosity level is not higher than the threshold - // these messages will still be logged to a file - if (level == 
GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) { - return; - } - - fcur = stdout; - - if (level != GGML_LOG_LEVEL_NONE) { - fcur = stderr; - } - } - - if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) { - if (timestamp) { - // [M.s.ms.us] - fprintf(fcur, "%s%d.%02d.%03d.%03d%s ", - g_col[COMMON_LOG_COL_BLUE], - (int) (timestamp / 1000000 / 60), - (int) (timestamp / 1000000 % 60), - (int) (timestamp / 1000 % 1000), - (int) (timestamp % 1000), - g_col[COMMON_LOG_COL_DEFAULT]); - } - - switch (level) { - case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN], g_col[COMMON_LOG_COL_DEFAULT]); break; - case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break; - case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED], "" ); break; - case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW], "" ); break; - default: - break; - } - } - - fprintf(fcur, "%s", msg.data()); - - if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) { - fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]); - } - - fflush(fcur); - } -}; - -struct common_log { - // default capacity - will be expanded if needed - common_log() : common_log(256) {} - - common_log(size_t capacity) { - file = nullptr; - prefix = false; - timestamps = false; - running = false; - t_start = t_us(); - - // initial message size - will be expanded if longer messages arrive - entries.resize(capacity); - for (auto & entry : entries) { - entry.msg.resize(256); - } - - head = 0; - tail = 0; - - resume(); - } - - ~common_log() { - pause(); - if (file) { - fclose(file); - } - } - -private: - std::mutex mtx; - std::thread thrd; - std::condition_variable cv; - - FILE * file; - - bool prefix; - bool timestamps; - bool running; - - int64_t t_start; - - // ring buffer of entries - std::vector entries; - size_t head; - size_t tail; - - // worker thread copies into this - common_log_entry cur; - -public: - void add(enum ggml_log_level level, const char * fmt, va_list args) { - std::lock_guard lock(mtx); - - if (!running) { - // discard messages while the worker thread is paused - return; - } - - auto & entry = entries[tail]; - - { - // cannot use args twice, so make a copy in case we need to expand the buffer - va_list args_copy; - va_copy(args_copy, args); - -#if 1 - const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args); - if (n >= entry.msg.size()) { - entry.msg.resize(n + 1); - vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy); - } -#else - // hack for bolding arguments - - std::stringstream ss; - for (int i = 0; fmt[i] != 0; i++) { - if (fmt[i] == '%') { - ss << LOG_COL_BOLD; - while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++]; - ss << LOG_COL_DEFAULT; - if (fmt[i] == 0) break; - } - ss << fmt[i]; - } - const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args); - if (n >= entry.msg.size()) { - entry.msg.resize(n + 1); - vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy); - } -#endif - va_end(args_copy); - } - - entry.level = level; - entry.prefix = prefix; - entry.timestamp = 0; - if (timestamps) { - entry.timestamp = t_us() - t_start; - } - entry.is_end = false; - - tail = (tail + 1) % entries.size(); - if (tail == head) { - // expand the buffer - std::vector new_entries(2*entries.size()); - - size_t new_tail = 0; - - do { - new_entries[new_tail] 
= std::move(entries[head]); - - head = (head + 1) % entries.size(); - new_tail = (new_tail + 1); - } while (head != tail); - - head = 0; - tail = new_tail; - - for (size_t i = tail; i < new_entries.size(); i++) { - new_entries[i].msg.resize(256); - } - - entries = std::move(new_entries); - } - - cv.notify_one(); - } - - void resume() { - std::lock_guard lock(mtx); - - if (running) { - return; - } - - running = true; - - thrd = std::thread([this]() { - while (true) { - { - std::unique_lock lock(mtx); - cv.wait(lock, [this]() { return head != tail; }); - - cur = entries[head]; - - head = (head + 1) % entries.size(); - } - - if (cur.is_end) { - break; - } - - cur.print(); // stdout and stderr - - if (file) { - cur.print(file); - } - } - }); - } - - void pause() { - { - std::lock_guard lock(mtx); - - if (!running) { - return; - } - - running = false; - - // push an entry to signal the worker thread to stop - { - auto & entry = entries[tail]; - entry.is_end = true; - - tail = (tail + 1) % entries.size(); - } - - cv.notify_one(); - } - - thrd.join(); - } - - void set_file(const char * path) { - pause(); - - if (file) { - fclose(file); - } - - if (path) { - file = fopen(path, "w"); - } else { - file = nullptr; - } - - resume(); - } - - void set_colors(bool colors) { - pause(); - - if (colors) { - g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; - g_col[COMMON_LOG_COL_BOLD] = LOG_COL_BOLD; - g_col[COMMON_LOG_COL_RED] = LOG_COL_RED; - g_col[COMMON_LOG_COL_GREEN] = LOG_COL_GREEN; - g_col[COMMON_LOG_COL_YELLOW] = LOG_COL_YELLOW; - g_col[COMMON_LOG_COL_BLUE] = LOG_COL_BLUE; - g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; - g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN; - g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE; - } else { - for (size_t i = 0; i < g_col.size(); i++) { - g_col[i] = ""; - } - } - - resume(); - } - - void set_prefix(bool prefix) { - std::lock_guard lock(mtx); - - this->prefix = prefix; - } - - void set_timestamps(bool timestamps) { - std::lock_guard lock(mtx); - - this->timestamps = timestamps; - } -}; - -// -// public API -// - -struct common_log * common_log_init() { - return new common_log; -} - -struct common_log * common_log_main() { - static struct common_log log; - - return &log; -} - -void common_log_pause(struct common_log * log) { - log->pause(); -} - -void common_log_resume(struct common_log * log) { - log->resume(); -} - -void common_log_free(struct common_log * log) { - delete log; -} - -void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) 
{ - va_list args; - va_start(args, fmt); - log->add(level, fmt, args); - va_end(args); -} - -void common_log_set_file(struct common_log * log, const char * file) { - log->set_file(file); -} - -void common_log_set_colors(struct common_log * log, bool colors) { - log->set_colors(colors); -} - -void common_log_set_prefix(struct common_log * log, bool prefix) { - log->set_prefix(prefix); -} - -void common_log_set_timestamps(struct common_log * log, bool timestamps) { - log->set_timestamps(timestamps); -} diff --git a/common/log.h b/common/log.h index 4ebc6314b..1bc5328ce 100644 --- a/common/log.h +++ b/common/log.h @@ -1,103 +1,724 @@ #pragma once -#include "ggml.h" // for ggml_log_level +#include +#include +#include +#include +#include +#include +#include +#include -#define LOG_CLR_TO_EOL "\033[K\r" -#define LOG_COL_DEFAULT "\033[0m" -#define LOG_COL_BOLD "\033[1m" -#define LOG_COL_RED "\033[31m" -#define LOG_COL_GREEN "\033[32m" -#define LOG_COL_YELLOW "\033[33m" -#define LOG_COL_BLUE "\033[34m" -#define LOG_COL_MAGENTA "\033[35m" -#define LOG_COL_CYAN "\033[36m" -#define LOG_COL_WHITE "\033[37m" +// -------------------------------- +// +// Basic usage: +// +// -------- +// +// The LOG() and LOG_TEE() macros are ready to go by default +// they do not require any initialization. +// +// LOGLN() and LOG_TEELN() are variants which automatically +// include \n character at the end of the log string. +// +// LOG() behaves exactly like printf, by default writing to a logfile. +// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ). +// +// Default logfile is named +// "llama..log" +// Default LOG_TEE() secondary output target is +// stderr +// +// Logs can be dynamically disabled or enabled using functions: +// log_disable() +// and +// log_enable() +// +// A log target can be changed with: +// log_set_target( string ) +// creating and opening, or re-opening a file by string filename +// or +// log_set_target( FILE* ) +// allowing to point at stderr, stdout, or any valid FILE* file handler. +// +// -------- +// +// End of Basic usage. +// +// -------------------------------- -#ifndef __GNUC__ -# define LOG_ATTRIBUTE_FORMAT(...) -#elif defined(__MINGW32__) -# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +// Specifies a log target. +// default uses log_handler() with "llama.log" log file +// this can be changed, by defining LOG_TARGET +// like so: +// +// #define LOG_TARGET (a valid FILE*) +// #include "log.h" +// +// or it can be simply redirected to stdout or stderr +// like so: +// +// #define LOG_TARGET stderr +// #include "log.h" +// +// The log target can also be redirected to a different function +// like so: +// +// #define LOG_TARGET log_handler_different() +// #include "log.h" +// +// FILE* log_handler_different() +// { +// return stderr; +// } +// +// or: +// +// #define LOG_TARGET log_handler_another_one("somelog.log") +// #include "log.h" +// +// FILE* log_handler_another_one(char*filename) +// { +// static FILE* logfile = nullptr; +// (...) +// if( !logfile ) +// { +// fopen(...) +// } +// (...) 
+// return logfile +// } +// +#ifndef LOG_TARGET + #define LOG_TARGET log_handler() #endif -#define LOG_DEFAULT_DEBUG 1 -#define LOG_DEFAULT_LLAMA 0 +#ifndef LOG_TEE_TARGET + #define LOG_TEE_TARGET stderr +#endif -// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower -// set via common_log_set_verbosity() -extern int common_log_verbosity_thold; +// Utility for synchronizing log configuration state +// since std::optional was introduced only in c++17 +enum LogTriState +{ + LogTriStateSame, + LogTriStateFalse, + LogTriStateTrue +}; -void common_log_set_verbosity_thold(int verbosity); // not thread-safe +// Utility to obtain "pid" like unique process id and use it when creating log files. +inline std::string log_get_pid() +{ + static std::string pid; + if (pid.empty()) + { + // std::this_thread::get_id() is the most portable way of obtaining a "process id" + // it's not the same as "pid" but is unique enough to solve multiple instances + // trying to write to the same log. + std::stringstream ss; + ss << std::this_thread::get_id(); + pid = ss.str(); + } -// the common_log uses an internal worker thread to print/write log messages -// when the worker thread is paused, incoming log messages are discarded -struct common_log; + return pid; +} -struct common_log * common_log_init(); -struct common_log * common_log_main(); // singleton, automatically destroys itself on exit -void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe -void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe -void common_log_free (struct common_log * log); +// Utility function for generating log file names with unique id based on thread id. +// invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" +// where the number is a runtime id of the current thread. 
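A hedged example of the macro interface this hunk brings back; the file names below are arbitrary placeholders, and the behavior notes are taken from the header's own comments rather than verified output.

#include "log.h"

int main(int argc, char ** argv) {
    log_dump_cmdline(argc, argv);                        // record how the binary was invoked
    LOG("printf-style into the default logfile: %d\n", 42);
    LOG_TEE("mirrored to the logfile and to %s\n", LOG_TEE_TARGET_STRING);

    // Switch to a generated filename, e.g. "example.log"
    // (or "example.<id>.log" when log_multilog(true) was requested).
    log_set_target(log_filename_generator("example", "log"));
    LOG("now writing to the generated file\n");

    log_disable();
    LOG("discarded while logging is disabled\n");
    log_enable();
    return 0;
}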
-LOG_ATTRIBUTE_FORMAT(3, 4) -void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...); +#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension) -// defaults: file = NULL, colors = false, prefix = false, timestamps = false -// -// regular log output: -// -// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34) -// llm_load_tensors: ggml ctx size = 0.27 MiB -// llm_load_tensors: offloading 32 repeating layers to GPU -// llm_load_tensors: offloading non-repeating layers to GPU -// -// with prefix = true, timestamps = true, the log output will look like this: -// -// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34) -// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB -// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU -// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU -// -// I - info (stdout, V = 0) -// W - warning (stderr, V = 0) -// E - error (stderr, V = 0) -// D - debug (stderr, V = LOG_DEFAULT_DEBUG) -// +// INTERNAL, DO NOT USE +inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension) +{ + static bool _multilog = false; -void common_log_set_file (struct common_log * log, const char * file); // not thread-safe -void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe -void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log -void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix + if (multilog != LogTriStateSame) + { + _multilog = multilog == LogTriStateTrue; + } -// helper macros for logging -// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold -// -// for example: -// -// LOG_DBG("this is a debug message: %d\n", expensive_function()); -// -// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold -// + std::stringstream buf; -#define LOG_TMPL(level, verbosity, ...) \ - do { \ - if ((verbosity) <= common_log_verbosity_thold) { \ - common_log_add(common_log_main(), (level), __VA_ARGS__); \ - } \ + buf << log_file_basename; + if (_multilog) + { + buf << "."; + buf << log_get_pid(); + } + buf << "."; + buf << log_file_extension; + + return buf.str(); +} + +#ifndef LOG_DEFAULT_FILE_NAME + #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log") +#endif + +// Utility for turning #define values into string literals +// so we can have a define for stderr and +// we can print "stderr" instead of literal stderr, etc. +#define LOG_STRINGIZE1(s) #s +#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s) + +#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET) + +// Allows disabling timestamps. 
+// in order to disable, define LOG_NO_TIMESTAMPS +// like so: +// +// #define LOG_NO_TIMESTAMPS +// #include "log.h" +// +#ifndef LOG_NO_TIMESTAMPS + #ifndef _MSC_VER + #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #else + #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #endif +#else + #define LOG_TIMESTAMP_FMT "%s" + #define LOG_TIMESTAMP_VAL ,"" +#endif + +#ifdef LOG_TEE_TIMESTAMPS + #ifndef _MSC_VER + #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #else + #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #endif +#else + #define LOG_TEE_TIMESTAMP_FMT "%s" + #define LOG_TEE_TIMESTAMP_VAL ,"" +#endif + +// Allows disabling file/line/function prefix +// in order to disable, define LOG_NO_FILE_LINE_FUNCTION +// like so: +// +// #define LOG_NO_FILE_LINE_FUNCTION +// #include "log.h" +// +#ifndef LOG_NO_FILE_LINE_FUNCTION + #ifndef _MSC_VER + #define LOG_FLF_FMT "[%24s:%5d][%24s] " + #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #else + #define LOG_FLF_FMT "[%24s:%5ld][%24s] " + #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ + #endif +#else + #define LOG_FLF_FMT "%s" + #define LOG_FLF_VAL ,"" +#endif + +#ifdef LOG_TEE_FILE_LINE_FUNCTION + #ifndef _MSC_VER + #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] " + #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #else + #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " + #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ + #endif +#else + #define LOG_TEE_FLF_FMT "%s" + #define LOG_TEE_FLF_VAL ,"" +#endif + +// INTERNAL, DO NOT USE +// USE LOG() INSTEAD +// +#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) + #define LOG_IMPL(str, ...) \ + do { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ } while (0) +#else + #define LOG_IMPL(str, ...) \ + do { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + } while (0) +#endif -#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__) -#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__) +// INTERNAL, DO NOT USE +// USE LOG_TEE() INSTEAD +// +#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) + #define LOG_TEE_IMPL(str, ...) \ + do { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ + { \ + fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \ + fflush(LOG_TEE_TARGET); \ + } \ + } while (0) +#else + #define LOG_TEE_IMPL(str, ...) 
\ + do { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ + { \ + fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \ + fflush(LOG_TEE_TARGET); \ + } \ + } while (0) +#endif -#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__) -#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__) -#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__) -#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__) -#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__) +// The '\0' as a last argument, is a trick to bypass the silly +// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro" +// so we can have a single macro which can be called just like printf. -#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__) -#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__) -#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__) -#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__) -#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT, verbosity, __VA_ARGS__) +// Main LOG macro. +// behaves like printf, and supports arguments the exact same way. +// +#if !defined(_MSC_VER) || defined(__clang__) + #define LOG(...) LOG_IMPL(__VA_ARGS__, "") +#else + #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "") +#endif + +// Main TEE macro. +// does the same as LOG +// and +// simultaneously writes stderr. +// +// Secondary target can be changed just like LOG_TARGET +// by defining LOG_TEE_TARGET +// +#if !defined(_MSC_VER) || defined(__clang__) + #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") +#else + #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "") +#endif + +// LOG macro variants with auto endline. +#if !defined(_MSC_VER) || defined(__clang__) + #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") + #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") +#else + #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n") + #define LOG_TEELN(str, ...) 
LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n") +#endif + +// INTERNAL, DO NOT USE +inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) +{ + static bool _initialized = false; + static bool _append = false; + static bool _disabled = filename.empty() && target == nullptr; + static std::string log_current_filename{filename}; + static FILE *log_current_target{target}; + static FILE *logfile = nullptr; + + if (change) + { + if (append != LogTriStateSame) + { + _append = append == LogTriStateTrue; + return logfile; + } + + if (disable == LogTriStateTrue) + { + // Disable primary target + _disabled = true; + } + // If previously disabled, only enable, and keep previous target + else if (disable == LogTriStateFalse) + { + _disabled = false; + } + // Otherwise, process the arguments + else if (log_current_filename != filename || log_current_target != target) + { + _initialized = false; + } + } + + if (_disabled) + { + // Log is disabled + return nullptr; + } + + if (_initialized) + { + // with fallback in case something went wrong + return logfile ? logfile : stderr; + } + + // do the (re)initialization + if (target != nullptr) + { + if (logfile != nullptr && logfile != stdout && logfile != stderr) + { + fclose(logfile); + } + + log_current_filename = LOG_DEFAULT_FILE_NAME; + log_current_target = target; + + logfile = target; + } + else + { + if (log_current_filename != filename) + { + if (logfile != nullptr && logfile != stdout && logfile != stderr) + { + fclose(logfile); + } + } + + logfile = fopen(filename.c_str(), _append ? "a" : "w"); + } + + if (!logfile) + { + // Verify whether the file was opened, otherwise fallback to stderr + logfile = stderr; + + fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno)); + fflush(stderr); + + // At this point we let the init flag be to true below, and let the target fallback to stderr + // otherwise we would repeatedly fopen() which was already unsuccessful + } + + _initialized = true; + + return logfile ? logfile : stderr; +} + +// INTERNAL, DO NOT USE +inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) +{ + return log_handler1_impl(change, append, disable, filename, target); +} + +// Disables logs entirely at runtime. +// Makes LOG() and LOG_TEE() produce no output, +// until enabled back. +#define log_disable() log_disable_impl() + +// INTERNAL, DO NOT USE +inline FILE *log_disable_impl() +{ + return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue); +} + +// Enables logs at runtime. 
+#define log_enable() log_enable_impl() + +// INTERNAL, DO NOT USE +inline FILE *log_enable_impl() +{ + return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse); +} + +// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) +#define log_set_target(target) log_set_target_impl(target) + +// INTERNAL, DO NOT USE +inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); } +inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); } + +// INTERNAL, DO NOT USE +inline FILE *log_handler() { return log_handler1_impl(); } + +// Enable or disable creating separate log files for each run. +// can ONLY be invoked BEFORE first log use. +#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "") +// Enable or disable append mode for log file. +// can ONLY be invoked BEFORE first log use. +#define log_append(enable) log_append_impl(enable) +// INTERNAL, DO NOT USE +inline FILE *log_append_impl(bool enable) +{ + return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame); +} + +inline void log_test() +{ + log_disable(); + LOG("01 Hello World to nobody, because logs are disabled!\n"); + log_enable(); + LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)); + LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n"); + log_set_target(stderr); + LOG("04 Hello World to stderr!\n"); + LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n"); + log_set_target(LOG_DEFAULT_FILE_NAME); + LOG("06 Hello World to default log file!\n"); + log_set_target(stdout); + LOG("07 Hello World to stdout!\n"); + log_set_target(LOG_DEFAULT_FILE_NAME); + LOG("08 Hello World to default log file again!\n"); + log_disable(); + LOG("09 Hello World _1_ into the void!\n"); + log_enable(); + LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n"); + log_disable(); + log_set_target("llama.anotherlog.log"); + LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n"); + log_enable(); + LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n"); + log_set_target("llama.yetanotherlog.log"); + LOG("13 Hello World this time in yet new file?\n"); + log_set_target(log_filename_generator("llama_autonamed", "log")); + LOG("14 Hello World in log with generated filename!\n"); +#ifdef _MSC_VER + LOG_TEE("15 Hello msvc TEE without arguments\n"); + LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test"); + LOG_TEELN("17 Hello msvc TEELN without arguments\n"); + LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test"); + LOG("19 Hello msvc LOG without arguments\n"); + LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test"); + LOGLN("21 Hello msvc LOGLN without arguments\n"); + LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test"); +#endif +} + +inline bool log_param_single_parse(const std::string & param) +{ + if ( param == "--log-test") + { + log_test(); + return true; + } + + if ( param == "--log-disable") + { + log_disable(); + return true; + } + + if ( param == "--log-enable") + { + log_enable(); + return true; + } + + if (param == "--log-new") + { + log_multilog(true); + return true; + } + + if (param == 
"--log-append") + { + log_append(true); + return true; + } + + return false; +} + +inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string()) +{ + if ( param == "--log-file") + { + if (!check_but_dont_parse) + { + log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log")); + } + + return true; + } + + return false; +} + +inline void log_print_usage() +{ + printf("log options:\n"); + /* format + printf(" -h, --help show this help message and exit\n");*/ + /* spacing + printf("__-param----------------Description\n");*/ + printf(" --log-test Run simple logging test\n"); + printf(" --log-disable Disable trace logs\n"); + printf(" --log-enable Enable trace logs\n"); + printf(" --log-file Specify a log filename (without extension)\n"); + printf(" --log-new Create a separate new log file on start. " + "Each log file will have unique name: \"..log\"\n"); + printf(" --log-append Don't truncate the old log file.\n"); + printf("\n"); +} + +#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) + +// INTERNAL, DO NOT USE +inline void log_dump_cmdline_impl(int argc, char **argv) +{ + std::stringstream buf; + for (int i = 0; i < argc; ++i) + { + if (std::string(argv[i]).find(' ') != std::string::npos) + { + buf << " \"" << argv[i] <<"\""; + } + else + { + buf << " " << argv[i]; + } + } + LOGLN("Cmd:%s", buf.str().c_str()); +} + +#define log_tostr(var) log_var_to_string_impl(var).c_str() + +inline std::string log_var_to_string_impl(bool var) +{ + return var ? "true" : "false"; +} + +inline std::string log_var_to_string_impl(std::string var) +{ + return var; +} + +inline std::string log_var_to_string_impl(const std::vector & var) +{ + std::stringstream buf; + buf << "[ "; + bool first = true; + for (auto e : var) + { + if (first) + { + first = false; + } + else + { + buf << ", "; + } + buf << std::to_string(e); + } + buf << " ]"; + + return buf.str(); +} + +template +inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens) +{ + std::stringstream buf; + buf << "[ "; + + bool first = true; + for (const auto & token : tokens) + { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, token); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf + << "'" << detokenized << "'" + << ":" << std::to_string(token); + } + buf << " ]"; + + return buf.str(); +} + +template +inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch) +{ + std::stringstream buf; + buf << "[ "; + + bool first = true; + for (int i = 0; i < batch.n_tokens; ++i) + { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, batch.token[i]); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf + << "\n" << std::to_string(i) + << ":token '" << detokenized << "'" + << ":pos " << std::to_string(batch.pos[i]) + << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) + << ":seq_id " << std::to_string(batch.seq_id[i][0]) + << ":logits " << std::to_string(batch.logits[i]); + } + buf << " ]"; + + return buf.str(); +} + +#ifdef LOG_DISABLE_LOGS + +#undef LOG +#define LOG(...) // dummy stub +#undef LOGLN +#define LOGLN(...) 
// dummy stub
+
+#undef LOG_TEE
+#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
+
+#undef LOG_TEELN
+#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
+
+#undef LOG_DISABLE
+#define LOG_DISABLE() // dummy stub
+
+#undef LOG_ENABLE
+#define LOG_ENABLE() // dummy stub
+
+#undef LOG_SET_TARGET
+#define LOG_SET_TARGET(...) // dummy stub
+
+#undef LOG_DUMP_CMDLINE
+#define LOG_DUMP_CMDLINE(...) // dummy stub
+
+#endif // LOG_DISABLE_LOGS
diff --git a/common/minja.hpp b/common/minja.hpp
deleted file mode 100644
index c58dd66e0..000000000
--- a/common/minja.hpp
+++ /dev/null
@@ -1,2883 +0,0 @@
-/*
-    Copyright 2024 Google LLC
-
-    Use of this source code is governed by an MIT-style
-    license that can be found in the LICENSE file or at
-    https://opensource.org/licenses/MIT.
-*/
-// SPDX-License-Identifier: MIT
-#pragma once
-
-#include -#include -#include -#include -#include -#include -#include -#include -#include
-
-using json = nlohmann::ordered_json;
-
-namespace minja {
-
-class Context;
-
-struct Options {
-    bool trim_blocks;  // removes the first newline after a block
-    bool lstrip_blocks;  // removes leading whitespace on the line of the block
-    bool keep_trailing_newline;  // don't remove last newline
-};
-
-struct ArgumentsValue;
-
-inline std::string normalize_newlines(const std::string & s) {
-#ifdef _WIN32
-    static const std::regex nl_regex("\r\n");
-    return std::regex_replace(s, nl_regex, "\n");
-#else
-    return s;
-#endif
-}
-
-/* Values that behave roughly like in Python. */
-class Value : public std::enable_shared_from_this<Value> {
-public:
-    using CallableType = std::function<Value(const std::shared_ptr<Context> &, ArgumentsValue &)>;
-    using FilterType = std::function<Value(const std::shared_ptr<Context> &, ArgumentsValue &)>;
-
-private:
-    using ObjectType = nlohmann::ordered_map<json, Value>;  // Only contains primitive keys
-    using ArrayType = std::vector<Value>;
-
-    std::shared_ptr<ArrayType> array_;
-    std::shared_ptr<ObjectType> object_;
-    std::shared_ptr<CallableType> callable_;
-    json primitive_;
-
-    Value(const std::shared_ptr<ArrayType> & array) : array_(array) {}
-    Value(const std::shared_ptr<ObjectType> & object) : object_(object) {}
-    Value(const std::shared_ptr<CallableType> & callable) : object_(std::make_shared<ObjectType>()), callable_(callable) {}
-
-    /* Python-style string repr */
-    static void dump_string(const json & primitive, std::ostringstream & out, char string_quote = '\'') {
-        if (!primitive.is_string()) throw std::runtime_error("Value is not a string: " + primitive.dump());
-        auto s = primitive.dump();
-        if (string_quote == '"' || s.find('\'') != std::string::npos) {
-            out << s;
-            return;
-        }
-        // Reuse json dump, just changing string quotes
-        out << string_quote;
-        for (size_t i = 1, n = s.size() - 1; i < n; ++i) {
-            if (s[i] == '\\' && s[i + 1] == '"') {
-                out << '"';
-                i++;
-            } else if (s[i] == string_quote) {
-                out << '\\' << string_quote;
-            } else {
-                out << s[i];
-            }
-        }
-        out << string_quote;
-    }
-    void dump(std::ostringstream & out, int indent = -1, int level = 0, bool to_json = false) const {
-        auto print_indent = [&](int level) {
-            if (indent > 0) {
-                out << "\n";
-                for (int i = 0, n = level * indent; i < n; ++i) out << ' ';
-            }
-        };
-        auto print_sub_sep = [&]() {
-            out << ',';
-            if (indent < 0) out << ' ';
-            else print_indent(level + 1);
-        };
-
-        auto string_quote = to_json ?
'"' : '\''; - - if (is_null()) out << "null"; - else if (array_) { - out << "["; - print_indent(level + 1); - for (size_t i = 0; i < array_->size(); ++i) { - if (i) print_sub_sep(); - (*array_)[i].dump(out, indent, level + 1, to_json); - } - print_indent(level); - out << "]"; - } else if (object_) { - out << "{"; - print_indent(level + 1); - for (auto begin = object_->begin(), it = begin; it != object_->end(); ++it) { - if (it != begin) print_sub_sep(); - if (it->first.is_string()) { - dump_string(it->first, out, string_quote); - } else { - out << string_quote << it->first.dump() << string_quote; - } - out << ": "; - it->second.dump(out, indent, level + 1, to_json); - } - print_indent(level); - out << "}"; - } else if (callable_) { - throw std::runtime_error("Cannot dump callable to JSON"); - } else if (is_boolean() && !to_json) { - out << (this->to_bool() ? "True" : "False"); - } else if (is_string() && !to_json) { - dump_string(primitive_, out, string_quote); - } else { - out << primitive_.dump(); - } - } - -public: - Value() {} - Value(const bool& v) : primitive_(v) {} - Value(const int64_t & v) : primitive_(v) {} - Value(const double& v) : primitive_(v) {} - Value(const std::nullptr_t &) {} - Value(const std::string & v) : primitive_(v) {} - Value(const char * v) : primitive_(std::string(v)) {} - - Value(const json & v) { - if (v.is_object()) { - auto object = std::make_shared(); - for (auto it = v.begin(); it != v.end(); ++it) { - (*object)[it.key()] = it.value(); - } - object_ = std::move(object); - } else if (v.is_array()) { - auto array = std::make_shared(); - for (const auto& item : v) { - array->push_back(Value(item)); - } - array_ = array; - } else { - primitive_ = v; - } - } - - std::vector keys() { - if (!object_) throw std::runtime_error("Value is not an object: " + dump()); - std::vector res; - for (const auto& item : *object_) { - res.push_back(item.first); - } - return res; - } - - size_t size() const { - if (is_object()) return object_->size(); - if (is_array()) return array_->size(); - if (is_string()) return primitive_.get().length(); - throw std::runtime_error("Value is not an array or object: " + dump()); - } - - static Value array(const std::vector values = {}) { - auto array = std::make_shared(); - for (const auto& item : values) { - array->push_back(item); - } - return Value(array); - } - static Value object(const std::shared_ptr object = std::make_shared()) { - return Value(object); - } - static Value callable(const CallableType & callable) { - return Value(std::make_shared(callable)); - } - - void insert(size_t index, const Value& v) { - if (!array_) - throw std::runtime_error("Value is not an array: " + dump()); - array_->insert(array_->begin() + index, v); - } - void push_back(const Value& v) { - if (!array_) - throw std::runtime_error("Value is not an array: " + dump()); - array_->push_back(v); - } - Value pop(const Value& index) { - if (is_array()) { - if (array_->empty()) - throw std::runtime_error("pop from empty list"); - if (index.is_null()) { - auto ret = array_->back(); - array_->pop_back(); - return ret; - } else if (!index.is_number_integer()) { - throw std::runtime_error("pop index must be an integer: " + index.dump()); - } else { - auto i = index.get(); - if (i < 0 || i >= static_cast(array_->size())) - throw std::runtime_error("pop index out of range: " + index.dump()); - auto it = array_->begin() + (i < 0 ? 
array_->size() + i : i); - auto ret = *it; - array_->erase(it); - return ret; - } - } else if (is_object()) { - if (!index.is_hashable()) - throw std::runtime_error("Unashable type: " + index.dump()); - auto it = object_->find(index.primitive_); - if (it == object_->end()) - throw std::runtime_error("Key not found: " + index.dump()); - auto ret = it->second; - object_->erase(it); - return ret; - } else { - throw std::runtime_error("Value is not an array or object: " + dump()); - } - } - Value get(const Value& key) { - if (array_) { - if (!key.is_number_integer()) { - return Value(); - } - auto index = key.get(); - return array_->at(index < 0 ? array_->size() + index : index); - } else if (object_) { - if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump()); - auto it = object_->find(key.primitive_); - if (it == object_->end()) return Value(); - return it->second; - } - return Value(); - } - void set(const Value& key, const Value& value) { - if (!object_) throw std::runtime_error("Value is not an object: " + dump()); - if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump()); - (*object_)[key.primitive_] = value; - } - Value call(const std::shared_ptr & context, ArgumentsValue & args) const { - if (!callable_) throw std::runtime_error("Value is not callable: " + dump()); - return (*callable_)(context, args); - } - - bool is_object() const { return !!object_; } - bool is_array() const { return !!array_; } - bool is_callable() const { return !!callable_; } - bool is_null() const { return !object_ && !array_ && primitive_.is_null() && !callable_; } - bool is_boolean() const { return primitive_.is_boolean(); } - bool is_number_integer() const { return primitive_.is_number_integer(); } - bool is_number_float() const { return primitive_.is_number_float(); } - bool is_number() const { return primitive_.is_number(); } - bool is_string() const { return primitive_.is_string(); } - bool is_iterable() const { return is_array() || is_object() || is_string(); } - - bool is_primitive() const { return !array_ && !object_ && !callable_; } - bool is_hashable() const { return is_primitive(); } - - bool empty() const { - if (is_null()) - throw std::runtime_error("Undefined value or reference"); - if (is_string()) return primitive_.empty(); - if (is_array()) return array_->empty(); - if (is_object()) return object_->empty(); - return false; - } - - void for_each(const std::function & callback) const { - if (is_null()) - throw std::runtime_error("Undefined value or reference"); - if (array_) { - for (auto& item : *array_) { - callback(item); - } - } else if (object_) { - for (auto & item : *object_) { - Value key(item.first); - callback(key); - } - } else if (is_string()) { - for (char c : primitive_.get()) { - auto val = Value(std::string(1, c)); - callback(val); - } - } else { - throw std::runtime_error("Value is not iterable: " + dump()); - } - } - - bool to_bool() const { - if (is_null()) return false; - if (is_boolean()) return get(); - if (is_number()) return get() != 0; - if (is_string()) return !get().empty(); - if (is_array()) return !empty(); - return true; - } - - int64_t to_int() const { - if (is_null()) return 0; - if (is_boolean()) return get() ? 
1 : 0; - if (is_number()) return static_cast(get()); - if (is_string()) { - try { - return std::stol(get()); - } catch (const std::exception &) { - return 0; - } - } - return 0; - } - - bool operator<(const Value & other) const { - if (is_null()) - throw std::runtime_error("Undefined value or reference"); - if (is_number() && other.is_number()) return get() < other.get(); - if (is_string() && other.is_string()) return get() < other.get(); - throw std::runtime_error("Cannot compare values: " + dump() + " < " + other.dump()); - } - bool operator>=(const Value & other) const { return !(*this < other); } - - bool operator>(const Value & other) const { - if (is_null()) - throw std::runtime_error("Undefined value or reference"); - if (is_number() && other.is_number()) return get() > other.get(); - if (is_string() && other.is_string()) return get() > other.get(); - throw std::runtime_error("Cannot compare values: " + dump() + " > " + other.dump()); - } - bool operator<=(const Value & other) const { return !(*this > other); } - - bool operator==(const Value & other) const { - if (callable_ || other.callable_) { - if (callable_.get() != other.callable_.get()) return false; - } - if (array_) { - if (!other.array_) return false; - if (array_->size() != other.array_->size()) return false; - for (size_t i = 0; i < array_->size(); ++i) { - if (!(*array_)[i].to_bool() || !(*other.array_)[i].to_bool() || (*array_)[i] != (*other.array_)[i]) return false; - } - return true; - } else if (object_) { - if (!other.object_) return false; - if (object_->size() != other.object_->size()) return false; - for (const auto& item : *object_) { - if (!item.second.to_bool() || !other.object_->count(item.first) || item.second != other.object_->at(item.first)) return false; - } - return true; - } else { - return primitive_ == other.primitive_; - } - } - bool operator!=(const Value & other) const { return !(*this == other); } - - bool contains(const char * key) const { return contains(std::string(key)); } - bool contains(const std::string & key) const { - if (array_) { - return false; - } else if (object_) { - return object_->find(key) != object_->end(); - } else { - throw std::runtime_error("contains can only be called on arrays and objects: " + dump()); - } - } - bool contains(const Value & value) const { - if (is_null()) - throw std::runtime_error("Undefined value or reference"); - if (array_) { - for (const auto& item : *array_) { - if (item.to_bool() && item == value) return true; - } - return false; - } else if (object_) { - if (!value.is_hashable()) throw std::runtime_error("Unashable type: " + value.dump()); - return object_->find(value.primitive_) != object_->end(); - } else { - throw std::runtime_error("contains can only be called on arrays and objects: " + dump()); - } - } - void erase(size_t index) { - if (!array_) throw std::runtime_error("Value is not an array: " + dump()); - array_->erase(array_->begin() + index); - } - void erase(const std::string & key) { - if (!object_) throw std::runtime_error("Value is not an object: " + dump()); - object_->erase(key); - } - const Value& at(const Value & index) const { - return const_cast(this)->at(index); - } - Value& at(const Value & index) { - if (!index.is_hashable()) throw std::runtime_error("Unashable type: " + dump()); - if (is_array()) return array_->at(index.get()); - if (is_object()) return object_->at(index.primitive_); - throw std::runtime_error("Value is not an array or object: " + dump()); - } - const Value& at(size_t index) const { - return 
const_cast(this)->at(index); - } - Value& at(size_t index) { - if (is_null()) - throw std::runtime_error("Undefined value or reference"); - if (is_array()) return array_->at(index); - if (is_object()) return object_->at(index); - throw std::runtime_error("Value is not an array or object: " + dump()); - } - - template - T get(const std::string & key, T default_value) const { - if (!contains(key)) return default_value; - return at(key).get(); - } - - template - T get() const { - if (is_primitive()) return primitive_.get(); - throw std::runtime_error("get not defined for this value type: " + dump()); - } - - std::string dump(int indent=-1, bool to_json=false) const { - std::ostringstream out; - dump(out, indent, 0, to_json); - return out.str(); - } - - Value operator-() const { - if (is_number_integer()) - return -get(); - else - return -get(); - } - std::string to_str() const { - if (is_string()) return get(); - if (is_number_integer()) return std::to_string(get()); - if (is_number_float()) return std::to_string(get()); - if (is_boolean()) return get() ? "True" : "False"; - if (is_null()) return "None"; - return dump(); - } - Value operator+(const Value& rhs) const { - if (is_string() || rhs.is_string()) { - return to_str() + rhs.to_str(); - } else if (is_number_integer() && rhs.is_number_integer()) { - return get() + rhs.get(); - } else if (is_array() && rhs.is_array()) { - auto res = Value::array(); - for (const auto& item : *array_) res.push_back(item); - for (const auto& item : *rhs.array_) res.push_back(item); - return res; - } else { - return get() + rhs.get(); - } - } - Value operator-(const Value& rhs) const { - if (is_number_integer() && rhs.is_number_integer()) - return get() - rhs.get(); - else - return get() - rhs.get(); - } - Value operator*(const Value& rhs) const { - if (is_string() && rhs.is_number_integer()) { - std::ostringstream out; - for (int64_t i = 0, n = rhs.get(); i < n; ++i) { - out << to_str(); - } - return out.str(); - } - else if (is_number_integer() && rhs.is_number_integer()) - return get() * rhs.get(); - else - return get() * rhs.get(); - } - Value operator/(const Value& rhs) const { - if (is_number_integer() && rhs.is_number_integer()) - return get() / rhs.get(); - else - return get() / rhs.get(); - } - Value operator%(const Value& rhs) const { - return get() % rhs.get(); - } -}; - -struct ArgumentsValue { - std::vector args; - std::vector> kwargs; - - bool has_named(const std::string & name) { - for (const auto & p : kwargs) { - if (p.first == name) return true; - } - return false; - } - - Value get_named(const std::string & name) { - for (const auto & [key, value] : kwargs) { - if (key == name) return value; - } - return Value(); - } - - bool empty() { - return args.empty() && kwargs.empty(); - } - - void expectArgs(const std::string & method_name, const std::pair & pos_count, const std::pair & kw_count) { - if (args.size() < pos_count.first || args.size() > pos_count.second || kwargs.size() < kw_count.first || kwargs.size() > kw_count.second) { - std::ostringstream out; - out << method_name << " must have between " << pos_count.first << " and " << pos_count.second << " positional arguments and between " << kw_count.first << " and " << kw_count.second << " keyword arguments"; - throw std::runtime_error(out.str()); - } - } -}; - -template <> -inline json Value::get() const { - if (is_primitive()) return primitive_; - if (is_null()) return json(); - if (array_) { - std::vector res; - for (const auto& item : *array_) { - res.push_back(item.get()); - } - return 
res; - } - if (object_) { - json res = json::object(); - for (const auto& [key, value] : *object_) { - if (key.is_string()) { - res[key.get()] = value.get(); - } else if (key.is_primitive()) { - res[key.dump()] = value.get(); - } else { - throw std::runtime_error("Invalid key type for conversion to JSON: " + key.dump()); - } - } - if (is_callable()) { - res["__callable__"] = true; - } - return res; - } - throw std::runtime_error("get not defined for this value type: " + dump()); -} - -} // namespace minja - -namespace std { - template <> - struct hash { - size_t operator()(const minja::Value & v) const { - if (!v.is_hashable()) - throw std::runtime_error("Unsupported type for hashing: " + v.dump()); - return std::hash()(v.get()); - } - }; -} // namespace std - -namespace minja { - -static std::string error_location_suffix(const std::string & source, size_t pos) { - auto get_line = [&](size_t line) { - auto start = source.begin(); - for (size_t i = 1; i < line; ++i) { - start = std::find(start, source.end(), '\n') + 1; - } - auto end = std::find(start, source.end(), '\n'); - return std::string(start, end); - }; - auto start = source.begin(); - auto end = source.end(); - auto it = start + pos; - auto line = std::count(start, it, '\n') + 1; - auto max_line = std::count(start, end, '\n') + 1; - auto col = pos - std::string(start, it).rfind('\n'); - std::ostringstream out; - out << " at row " << line << ", column " << col << ":\n"; - if (line > 1) out << get_line(line - 1) << "\n"; - out << get_line(line) << "\n"; - out << std::string(col - 1, ' ') << "^\n"; - if (line < max_line) out << get_line(line + 1) << "\n"; - - return out.str(); -} - -class Context : public std::enable_shared_from_this { - protected: - Value values_; - std::shared_ptr parent_; - public: - Context(Value && values, const std::shared_ptr & parent = nullptr) : values_(std::move(values)), parent_(parent) { - if (!values_.is_object()) throw std::runtime_error("Context values must be an object: " + values_.dump()); - } - virtual ~Context() {} - - static std::shared_ptr builtins(); - static std::shared_ptr make(Value && values, const std::shared_ptr & parent = builtins()); - - std::vector keys() { - return values_.keys(); - } - virtual Value get(const Value & key) { - if (values_.contains(key)) return values_.at(key); - if (parent_) return parent_->get(key); - return Value(); - } - virtual Value & at(const Value & key) { - if (values_.contains(key)) return values_.at(key); - if (parent_) return parent_->at(key); - throw std::runtime_error("Undefined variable: " + key.dump()); - } - virtual bool contains(const Value & key) { - if (values_.contains(key)) return true; - if (parent_) return parent_->contains(key); - return false; - } - virtual void set(const Value & key, const Value & value) { - values_.set(key, value); - } -}; - -struct Location { - std::shared_ptr source; - size_t pos; -}; - -class Expression { -protected: - virtual Value do_evaluate(const std::shared_ptr & context) const = 0; -public: - using Parameters = std::vector>>; - - Location location; - - Expression(const Location & location) : location(location) {} - virtual ~Expression() = default; - - Value evaluate(const std::shared_ptr & context) const { - try { - return do_evaluate(context); - } catch (const std::exception & e) { - std::ostringstream out; - out << e.what(); - if (location.source) out << error_location_suffix(*location.source, location.pos); - throw std::runtime_error(out.str()); - } - } -}; - -class VariableExpr : public Expression { - std::string 
name; -public: - VariableExpr(const Location & location, const std::string& n) - : Expression(location), name(n) {} - std::string get_name() const { return name; } - Value do_evaluate(const std::shared_ptr & context) const override { - if (!context->contains(name)) { - return Value(); - } - return context->at(name); - } -}; - -static void destructuring_assign(const std::vector & var_names, const std::shared_ptr & context, Value& item) { - if (var_names.size() == 1) { - Value name(var_names[0]); - context->set(name, item); - } else { - if (!item.is_array() || item.size() != var_names.size()) { - throw std::runtime_error("Mismatched number of variables and items in destructuring assignment"); - } - for (size_t i = 0; i < var_names.size(); ++i) { - context->set(var_names[i], item.at(i)); - } - } -} - -enum SpaceHandling { Keep, Strip, StripSpaces, StripNewline }; - -class TemplateToken { -public: - enum class Type { Text, Expression, If, Else, Elif, EndIf, For, EndFor, Generation, EndGeneration, Set, EndSet, Comment, Macro, EndMacro, Filter, EndFilter, Break, Continue }; - - static std::string typeToString(Type t) { - switch (t) { - case Type::Text: return "text"; - case Type::Expression: return "expression"; - case Type::If: return "if"; - case Type::Else: return "else"; - case Type::Elif: return "elif"; - case Type::EndIf: return "endif"; - case Type::For: return "for"; - case Type::EndFor: return "endfor"; - case Type::Set: return "set"; - case Type::EndSet: return "endset"; - case Type::Comment: return "comment"; - case Type::Macro: return "macro"; - case Type::EndMacro: return "endmacro"; - case Type::Filter: return "filter"; - case Type::EndFilter: return "endfilter"; - case Type::Generation: return "generation"; - case Type::EndGeneration: return "endgeneration"; - case Type::Break: return "break"; - case Type::Continue: return "continue"; - } - return "Unknown"; - } - - TemplateToken(Type type, const Location & location, SpaceHandling pre, SpaceHandling post) : type(type), location(location), pre_space(pre), post_space(post) {} - virtual ~TemplateToken() = default; - - Type type; - Location location; - SpaceHandling pre_space = SpaceHandling::Keep; - SpaceHandling post_space = SpaceHandling::Keep; -}; - -struct TextTemplateToken : public TemplateToken { - std::string text; - TextTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Text, location, pre, post), text(t) {} -}; - -struct ExpressionTemplateToken : public TemplateToken { - std::shared_ptr expr; - ExpressionTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr && e) : TemplateToken(Type::Expression, location, pre, post), expr(std::move(e)) {} -}; - -struct IfTemplateToken : public TemplateToken { - std::shared_ptr condition; - IfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr && c) : TemplateToken(Type::If, location, pre, post), condition(std::move(c)) {} -}; - -struct ElifTemplateToken : public TemplateToken { - std::shared_ptr condition; - ElifTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr && c) : TemplateToken(Type::Elif, location, pre, post), condition(std::move(c)) {} -}; - -struct ElseTemplateToken : public TemplateToken { - ElseTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Else, location, pre, post) {} -}; - -struct EndIfTemplateToken : public TemplateToken 
{ - EndIfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndIf, location, pre, post) {} -}; - -struct MacroTemplateToken : public TemplateToken { - std::shared_ptr name; - Expression::Parameters params; - MacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr && n, Expression::Parameters && p) - : TemplateToken(Type::Macro, location, pre, post), name(std::move(n)), params(std::move(p)) {} -}; - -struct EndMacroTemplateToken : public TemplateToken { - EndMacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndMacro, location, pre, post) {} -}; - -struct FilterTemplateToken : public TemplateToken { - std::shared_ptr filter; - FilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr && filter) - : TemplateToken(Type::Filter, location, pre, post), filter(std::move(filter)) {} -}; - -struct EndFilterTemplateToken : public TemplateToken { - EndFilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFilter, location, pre, post) {} -}; - -struct ForTemplateToken : public TemplateToken { - std::vector var_names; - std::shared_ptr iterable; - std::shared_ptr condition; - bool recursive; - ForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::vector & vns, std::shared_ptr && iter, - std::shared_ptr && c, bool r) - : TemplateToken(Type::For, location, pre, post), var_names(vns), iterable(std::move(iter)), condition(std::move(c)), recursive(r) {} -}; - -struct EndForTemplateToken : public TemplateToken { - EndForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, location, pre, post) {} -}; - -struct GenerationTemplateToken : public TemplateToken { - GenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, location, pre, post) {} -}; - -struct EndGenerationTemplateToken : public TemplateToken { - EndGenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, location, pre, post) {} -}; - -struct SetTemplateToken : public TemplateToken { - std::string ns; - std::vector var_names; - std::shared_ptr value; - SetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string & ns, const std::vector & vns, std::shared_ptr && v) - : TemplateToken(Type::Set, location, pre, post), ns(ns), var_names(vns), value(std::move(v)) {} -}; - -struct EndSetTemplateToken : public TemplateToken { - EndSetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndSet, location, pre, post) {} -}; - -struct CommentTemplateToken : public TemplateToken { - std::string text; - CommentTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, location, pre, post), text(t) {} -}; - -enum class LoopControlType { Break, Continue }; - -class LoopControlException : public std::runtime_error { -public: - LoopControlType control_type; - LoopControlException(const std::string & message, LoopControlType control_type) : std::runtime_error(message), control_type(control_type) {} - LoopControlException(LoopControlType control_type) - : std::runtime_error((control_type == LoopControlType::Continue ? 
"continue" : "break") + std::string(" outside of a loop")), - control_type(control_type) {} -}; - -struct LoopControlTemplateToken : public TemplateToken { - LoopControlType control_type; - LoopControlTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, location, pre, post), control_type(control_type) {} -}; - -class TemplateNode { - Location location_; -protected: - virtual void do_render(std::ostringstream & out, const std::shared_ptr & context) const = 0; - -public: - TemplateNode(const Location & location) : location_(location) {} - void render(std::ostringstream & out, const std::shared_ptr & context) const { - try { - do_render(out, context); - } catch (const LoopControlException & e) { - // TODO: make stack creation lazy. Only needed if it was thrown outside of a loop. - std::ostringstream err; - err << e.what(); - if (location_.source) err << error_location_suffix(*location_.source, location_.pos); - throw LoopControlException(err.str(), e.control_type); - } catch (const std::exception & e) { - std::ostringstream err; - err << e.what(); - if (location_.source) err << error_location_suffix(*location_.source, location_.pos); - throw std::runtime_error(err.str()); - } - } - const Location & location() const { return location_; } - virtual ~TemplateNode() = default; - std::string render(const std::shared_ptr & context) const { - std::ostringstream out; - render(out, context); - return out.str(); - } -}; - -class SequenceNode : public TemplateNode { - std::vector> children; -public: - SequenceNode(const Location & location, std::vector> && c) - : TemplateNode(location), children(std::move(c)) {} - void do_render(std::ostringstream & out, const std::shared_ptr & context) const override { - for (const auto& child : children) child->render(out, context); - } -}; - -class TextNode : public TemplateNode { - std::string text; -public: - TextNode(const Location & location, const std::string& t) : TemplateNode(location), text(t) {} - void do_render(std::ostringstream & out, const std::shared_ptr &) const override { - out << text; - } -}; - -class ExpressionNode : public TemplateNode { - std::shared_ptr expr; -public: - ExpressionNode(const Location & location, std::shared_ptr && e) : TemplateNode(location), expr(std::move(e)) {} - void do_render(std::ostringstream & out, const std::shared_ptr & context) const override { - if (!expr) throw std::runtime_error("ExpressionNode.expr is null"); - auto result = expr->evaluate(context); - if (result.is_string()) { - out << result.get(); - } else if (result.is_boolean()) { - out << (result.get() ? 
"True" : "False"); - } else if (!result.is_null()) { - out << result.dump(); - } - } -}; - -class IfNode : public TemplateNode { - std::vector, std::shared_ptr>> cascade; -public: - IfNode(const Location & location, std::vector, std::shared_ptr>> && c) - : TemplateNode(location), cascade(std::move(c)) {} - void do_render(std::ostringstream & out, const std::shared_ptr & context) const override { - for (const auto& branch : cascade) { - auto enter_branch = true; - if (branch.first) { - enter_branch = branch.first->evaluate(context).to_bool(); - } - if (enter_branch) { - if (!branch.second) throw std::runtime_error("IfNode.cascade.second is null"); - branch.second->render(out, context); - return; - } - } - } -}; - -class LoopControlNode : public TemplateNode { - LoopControlType control_type_; - public: - LoopControlNode(const Location & location, LoopControlType control_type) : TemplateNode(location), control_type_(control_type) {} - void do_render(std::ostringstream &, const std::shared_ptr &) const override { - throw LoopControlException(control_type_); - } -}; - -class ForNode : public TemplateNode { - std::vector var_names; - std::shared_ptr iterable; - std::shared_ptr condition; - std::shared_ptr body; - bool recursive; - std::shared_ptr else_body; -public: - ForNode(const Location & location, std::vector && var_names, std::shared_ptr && iterable, - std::shared_ptr && condition, std::shared_ptr && body, bool recursive, std::shared_ptr && else_body) - : TemplateNode(location), var_names(var_names), iterable(std::move(iterable)), condition(std::move(condition)), body(std::move(body)), recursive(recursive), else_body(std::move(else_body)) {} - - void do_render(std::ostringstream & out, const std::shared_ptr & context) const override { - // https://jinja.palletsprojects.com/en/3.0.x/templates/#for - if (!iterable) throw std::runtime_error("ForNode.iterable is null"); - if (!body) throw std::runtime_error("ForNode.body is null"); - - auto iterable_value = iterable->evaluate(context); - Value::CallableType loop_function; - - std::function visit = [&](Value& iter) { - auto filtered_items = Value::array(); - if (!iter.is_null()) { - if (!iterable_value.is_iterable()) { - throw std::runtime_error("For loop iterable must be iterable: " + iterable_value.dump()); - } - iterable_value.for_each([&](Value & item) { - destructuring_assign(var_names, context, item); - if (!condition || condition->evaluate(context).to_bool()) { - filtered_items.push_back(item); - } - }); - } - if (filtered_items.empty()) { - if (else_body) { - else_body->render(out, context); - } - } else { - auto loop = recursive ? 
Value::callable(loop_function) : Value::object(); - loop.set("length", (int64_t) filtered_items.size()); - - size_t cycle_index = 0; - loop.set("cycle", Value::callable([&](const std::shared_ptr &, ArgumentsValue & args) { - if (args.args.empty() || !args.kwargs.empty()) { - throw std::runtime_error("cycle() expects at least 1 positional argument and no named arg"); - } - auto item = args.args[cycle_index]; - cycle_index = (cycle_index + 1) % args.args.size(); - return item; - })); - auto loop_context = Context::make(Value::object(), context); - loop_context->set("loop", loop); - for (size_t i = 0, n = filtered_items.size(); i < n; ++i) { - auto & item = filtered_items.at(i); - destructuring_assign(var_names, loop_context, item); - loop.set("index", (int64_t) i + 1); - loop.set("index0", (int64_t) i); - loop.set("revindex", (int64_t) (n - i)); - loop.set("revindex0", (int64_t) (n - i - 1)); - loop.set("length", (int64_t) n); - loop.set("first", i == 0); - loop.set("last", i == (n - 1)); - loop.set("previtem", i > 0 ? filtered_items.at(i - 1) : Value()); - loop.set("nextitem", i < n - 1 ? filtered_items.at(i + 1) : Value()); - try { - body->render(out, loop_context); - } catch (const LoopControlException & e) { - if (e.control_type == LoopControlType::Break) break; - if (e.control_type == LoopControlType::Continue) continue; - } - } - } - }; - - if (recursive) { - loop_function = [&](const std::shared_ptr &, ArgumentsValue & args) { - if (args.args.size() != 1 || !args.kwargs.empty() || !args.args[0].is_array()) { - throw std::runtime_error("loop() expects exactly 1 positional iterable argument"); - } - auto & items = args.args[0]; - visit(items); - return Value(); - }; - } - - visit(iterable_value); - } -}; - -class MacroNode : public TemplateNode { - std::shared_ptr name; - Expression::Parameters params; - std::shared_ptr body; - std::unordered_map named_param_positions; -public: - MacroNode(const Location & location, std::shared_ptr && n, Expression::Parameters && p, std::shared_ptr && b) - : TemplateNode(location), name(std::move(n)), params(std::move(p)), body(std::move(b)) { - for (size_t i = 0; i < params.size(); ++i) { - const auto & name = params[i].first; - if (!name.empty()) { - named_param_positions[name] = i; - } - } - } - void do_render(std::ostringstream &, const std::shared_ptr & macro_context) const override { - if (!name) throw std::runtime_error("MacroNode.name is null"); - if (!body) throw std::runtime_error("MacroNode.body is null"); - auto callable = Value::callable([&](const std::shared_ptr & context, ArgumentsValue & args) { - auto call_context = macro_context; - std::vector param_set(params.size(), false); - for (size_t i = 0, n = args.args.size(); i < n; i++) { - auto & arg = args.args[i]; - if (i >= params.size()) throw std::runtime_error("Too many positional arguments for macro " + name->get_name()); - param_set[i] = true; - auto & param_name = params[i].first; - call_context->set(param_name, arg); - } - for (auto & [arg_name, value] : args.kwargs) { - auto it = named_param_positions.find(arg_name); - if (it == named_param_positions.end()) throw std::runtime_error("Unknown parameter name for macro " + name->get_name() + ": " + arg_name); - - call_context->set(arg_name, value); - param_set[it->second] = true; - } - // Set default values for parameters that were not passed - for (size_t i = 0, n = params.size(); i < n; i++) { - if (!param_set[i] && params[i].second != nullptr) { - auto val = params[i].second->evaluate(context); - call_context->set(params[i].first, 
val); - } - } - return body->render(call_context); - }); - macro_context->set(name->get_name(), callable); - } -}; - -class FilterNode : public TemplateNode { - std::shared_ptr filter; - std::shared_ptr body; - -public: - FilterNode(const Location & location, std::shared_ptr && f, std::shared_ptr && b) - : TemplateNode(location), filter(std::move(f)), body(std::move(b)) {} - - void do_render(std::ostringstream & out, const std::shared_ptr & context) const override { - if (!filter) throw std::runtime_error("FilterNode.filter is null"); - if (!body) throw std::runtime_error("FilterNode.body is null"); - auto filter_value = filter->evaluate(context); - if (!filter_value.is_callable()) { - throw std::runtime_error("Filter must be a callable: " + filter_value.dump()); - } - std::string rendered_body = body->render(context); - - ArgumentsValue filter_args = {{Value(rendered_body)}, {}}; - auto result = filter_value.call(context, filter_args); - out << result.to_str(); - } -}; - -class SetNode : public TemplateNode { - std::string ns; - std::vector var_names; - std::shared_ptr value; -public: - SetNode(const Location & location, const std::string & ns, const std::vector & vns, std::shared_ptr && v) - : TemplateNode(location), ns(ns), var_names(vns), value(std::move(v)) {} - void do_render(std::ostringstream &, const std::shared_ptr & context) const override { - if (!value) throw std::runtime_error("SetNode.value is null"); - if (!ns.empty()) { - if (var_names.size() != 1) { - throw std::runtime_error("Namespaced set only supports a single variable name"); - } - auto & name = var_names[0]; - auto ns_value = context->get(ns); - if (!ns_value.is_object()) throw std::runtime_error("Namespace '" + ns + "' is not an object"); - ns_value.set(name, this->value->evaluate(context)); - } else { - auto val = value->evaluate(context); - destructuring_assign(var_names, context, val); - } - } -}; - -class SetTemplateNode : public TemplateNode { - std::string name; - std::shared_ptr template_value; -public: - SetTemplateNode(const Location & location, const std::string & name, std::shared_ptr && tv) - : TemplateNode(location), name(name), template_value(std::move(tv)) {} - void do_render(std::ostringstream &, const std::shared_ptr & context) const override { - if (!template_value) throw std::runtime_error("SetTemplateNode.template_value is null"); - Value value { template_value->render(context) }; - context->set(name, value); - } -}; - -class IfExpr : public Expression { - std::shared_ptr condition; - std::shared_ptr then_expr; - std::shared_ptr else_expr; -public: - IfExpr(const Location & location, std::shared_ptr && c, std::shared_ptr && t, std::shared_ptr && e) - : Expression(location), condition(std::move(c)), then_expr(std::move(t)), else_expr(std::move(e)) {} - Value do_evaluate(const std::shared_ptr & context) const override { - if (!condition) throw std::runtime_error("IfExpr.condition is null"); - if (!then_expr) throw std::runtime_error("IfExpr.then_expr is null"); - if (condition->evaluate(context).to_bool()) { - return then_expr->evaluate(context); - } - if (else_expr) { - return else_expr->evaluate(context); - } - return nullptr; - } -}; - -class LiteralExpr : public Expression { - Value value; -public: - LiteralExpr(const Location & location, const Value& v) - : Expression(location), value(v) {} - Value do_evaluate(const std::shared_ptr &) const override { return value; } -}; - -class ArrayExpr : public Expression { - std::vector> elements; -public: - ArrayExpr(const Location & location, 
std::vector> && e) - : Expression(location), elements(std::move(e)) {} - Value do_evaluate(const std::shared_ptr & context) const override { - auto result = Value::array(); - for (const auto& e : elements) { - if (!e) throw std::runtime_error("Array element is null"); - result.push_back(e->evaluate(context)); - } - return result; - } -}; - -class DictExpr : public Expression { - std::vector, std::shared_ptr>> elements; -public: - DictExpr(const Location & location, std::vector, std::shared_ptr>> && e) - : Expression(location), elements(std::move(e)) {} - Value do_evaluate(const std::shared_ptr & context) const override { - auto result = Value::object(); - for (const auto& [key, value] : elements) { - if (!key) throw std::runtime_error("Dict key is null"); - if (!value) throw std::runtime_error("Dict value is null"); - result.set(key->evaluate(context), value->evaluate(context)); - } - return result; - } -}; - -class SliceExpr : public Expression { -public: - std::shared_ptr start, end; - SliceExpr(const Location & location, std::shared_ptr && s, std::shared_ptr && e) - : Expression(location), start(std::move(s)), end(std::move(e)) {} - Value do_evaluate(const std::shared_ptr &) const override { - throw std::runtime_error("SliceExpr not implemented"); - } -}; - -class SubscriptExpr : public Expression { - std::shared_ptr base; - std::shared_ptr index; -public: - SubscriptExpr(const Location & location, std::shared_ptr && b, std::shared_ptr && i) - : Expression(location), base(std::move(b)), index(std::move(i)) {} - Value do_evaluate(const std::shared_ptr & context) const override { - if (!base) throw std::runtime_error("SubscriptExpr.base is null"); - if (!index) throw std::runtime_error("SubscriptExpr.index is null"); - auto target_value = base->evaluate(context); - if (auto slice = dynamic_cast(index.get())) { - auto start = slice->start ? slice->start->evaluate(context).get() : 0; - auto end = slice->end ? slice->end->evaluate(context).get() : (int64_t) target_value.size(); - if (target_value.is_string()) { - std::string s = target_value.get(); - if (start < 0) start = s.size() + start; - if (end < 0) end = s.size() + end; - return s.substr(start, end - start); - } else if (target_value.is_array()) { - if (start < 0) start = target_value.size() + start; - if (end < 0) end = target_value.size() + end; - auto result = Value::array(); - for (auto i = start; i < end; ++i) { - result.push_back(target_value.at(i)); - } - return result; - } else { - throw std::runtime_error(target_value.is_null() ? "Cannot subscript null" : "Subscripting only supported on arrays and strings"); - } - } else { - auto index_value = index->evaluate(context); - if (target_value.is_null()) { - if (auto t = dynamic_cast(base.get())) { - throw std::runtime_error("'" + t->get_name() + "' is " + (context->contains(t->get_name()) ? 
"null" : "not defined")); - } - throw std::runtime_error("Trying to access property '" + index_value.dump() + "' on null!"); - } - return target_value.get(index_value); - } - } -}; - -class UnaryOpExpr : public Expression { -public: - enum class Op { Plus, Minus, LogicalNot, Expansion, ExpansionDict }; - std::shared_ptr expr; - Op op; - UnaryOpExpr(const Location & location, std::shared_ptr && e, Op o) - : Expression(location), expr(std::move(e)), op(o) {} - Value do_evaluate(const std::shared_ptr & context) const override { - if (!expr) throw std::runtime_error("UnaryOpExpr.expr is null"); - auto e = expr->evaluate(context); - switch (op) { - case Op::Plus: return e; - case Op::Minus: return -e; - case Op::LogicalNot: return !e.to_bool(); - case Op::Expansion: - case Op::ExpansionDict: - throw std::runtime_error("Expansion operator is only supported in function calls and collections"); - - } - throw std::runtime_error("Unknown unary operator"); - } -}; - -class BinaryOpExpr : public Expression { -public: - enum class Op { StrConcat, Add, Sub, Mul, MulMul, Div, DivDiv, Mod, Eq, Ne, Lt, Gt, Le, Ge, And, Or, In, NotIn, Is, IsNot }; -private: - std::shared_ptr left; - std::shared_ptr right; - Op op; -public: - BinaryOpExpr(const Location & location, std::shared_ptr && l, std::shared_ptr && r, Op o) - : Expression(location), left(std::move(l)), right(std::move(r)), op(o) {} - Value do_evaluate(const std::shared_ptr & context) const override { - if (!left) throw std::runtime_error("BinaryOpExpr.left is null"); - if (!right) throw std::runtime_error("BinaryOpExpr.right is null"); - auto l = left->evaluate(context); - - auto do_eval = [&](const Value & l) -> Value { - if (op == Op::Is || op == Op::IsNot) { - auto t = dynamic_cast(right.get()); - if (!t) throw std::runtime_error("Right side of 'is' operator must be a variable"); - - auto eval = [&]() { - const auto & name = t->get_name(); - if (name == "none") return l.is_null(); - if (name == "boolean") return l.is_boolean(); - if (name == "integer") return l.is_number_integer(); - if (name == "float") return l.is_number_float(); - if (name == "number") return l.is_number(); - if (name == "string") return l.is_string(); - if (name == "mapping") return l.is_object(); - if (name == "iterable") return l.is_iterable(); - if (name == "sequence") return l.is_array(); - if (name == "defined") return !l.is_null(); - throw std::runtime_error("Unknown type for 'is' operator: " + name); - }; - auto value = eval(); - return Value(op == Op::Is ? 
value : !value); - } - - if (op == Op::And) { - if (!l.to_bool()) return Value(false); - return right->evaluate(context).to_bool(); - } else if (op == Op::Or) { - if (l.to_bool()) return l; - return right->evaluate(context); - } - - auto r = right->evaluate(context); - switch (op) { - case Op::StrConcat: return l.to_str() + r.to_str(); - case Op::Add: return l + r; - case Op::Sub: return l - r; - case Op::Mul: return l * r; - case Op::Div: return l / r; - case Op::MulMul: return std::pow(l.get(), r.get()); - case Op::DivDiv: return l.get() / r.get(); - case Op::Mod: return l.get() % r.get(); - case Op::Eq: return l == r; - case Op::Ne: return l != r; - case Op::Lt: return l < r; - case Op::Gt: return l > r; - case Op::Le: return l <= r; - case Op::Ge: return l >= r; - case Op::In: return (r.is_array() || r.is_object()) && r.contains(l); - case Op::NotIn: return !(r.is_array() && r.contains(l)); - default: break; - } - throw std::runtime_error("Unknown binary operator"); - }; - - if (l.is_callable()) { - return Value::callable([l, do_eval](const std::shared_ptr & context, ArgumentsValue & args) { - auto ll = l.call(context, args); - return do_eval(ll); //args[0].second); - }); - } else { - return do_eval(l); - } - } -}; - -struct ArgumentsExpression { - std::vector> args; - std::vector>> kwargs; - - ArgumentsValue evaluate(const std::shared_ptr & context) const { - ArgumentsValue vargs; - for (const auto& arg : this->args) { - if (auto un_expr = std::dynamic_pointer_cast(arg)) { - if (un_expr->op == UnaryOpExpr::Op::Expansion) { - auto array = un_expr->expr->evaluate(context); - if (!array.is_array()) { - throw std::runtime_error("Expansion operator only supported on arrays"); - } - array.for_each([&](Value & value) { - vargs.args.push_back(value); - }); - continue; - } else if (un_expr->op == UnaryOpExpr::Op::ExpansionDict) { - auto dict = un_expr->expr->evaluate(context); - if (!dict.is_object()) { - throw std::runtime_error("ExpansionDict operator only supported on objects"); - } - dict.for_each([&](const Value & key) { - vargs.kwargs.push_back({key.get(), dict.at(key)}); - }); - continue; - } - } - vargs.args.push_back(arg->evaluate(context)); - } - for (const auto& [name, value] : this->kwargs) { - vargs.kwargs.push_back({name, value->evaluate(context)}); - } - return vargs; - } -}; - -static std::string strip(const std::string & s) { - auto start = s.find_first_not_of(" \t\n\r"); - if (start == std::string::npos) return ""; - auto end = s.find_last_not_of(" \t\n\r"); - return s.substr(start, end - start + 1); -} - -static std::string capitalize(const std::string & s) { - if (s.empty()) return s; - auto result = s; - result[0] = std::toupper(result[0]); - return result; -} - -static std::string html_escape(const std::string & s) { - std::string result; - result.reserve(s.size()); - for (const auto & c : s) { - switch (c) { - case '&': result += "&"; break; - case '<': result += "<"; break; - case '>': result += ">"; break; - case '"': result += """; break; - case '\'': result += "'"; break; - default: result += c; break; - } - } - return result; -} - -class MethodCallExpr : public Expression { - std::shared_ptr object; - std::shared_ptr method; - ArgumentsExpression args; -public: - MethodCallExpr(const Location & location, std::shared_ptr && obj, std::shared_ptr && m, ArgumentsExpression && a) - : Expression(location), object(std::move(obj)), method(std::move(m)), args(std::move(a)) {} - Value do_evaluate(const std::shared_ptr & context) const override { - if (!object) throw 
std::runtime_error("MethodCallExpr.object is null"); - if (!method) throw std::runtime_error("MethodCallExpr.method is null"); - auto obj = object->evaluate(context); - auto vargs = args.evaluate(context); - if (obj.is_null()) { - throw std::runtime_error("Trying to call method '" + method->get_name() + "' on null"); - } - if (obj.is_array()) { - if (method->get_name() == "append") { - vargs.expectArgs("append method", {1, 1}, {0, 0}); - obj.push_back(vargs.args[0]); - return Value(); - } else if (method->get_name() == "pop") { - vargs.expectArgs("pop method", {0, 1}, {0, 0}); - return obj.pop(vargs.args.empty() ? Value() : vargs.args[0]); - } else if (method->get_name() == "insert") { - vargs.expectArgs("insert method", {2, 2}, {0, 0}); - auto index = vargs.args[0].get(); - if (index < 0 || index > (int64_t) obj.size()) throw std::runtime_error("Index out of range for insert method"); - obj.insert(index, vargs.args[1]); - return Value(); - } - } else if (obj.is_object()) { - if (method->get_name() == "items") { - vargs.expectArgs("items method", {0, 0}, {0, 0}); - auto result = Value::array(); - for (const auto& key : obj.keys()) { - result.push_back(Value::array({key, obj.at(key)})); - } - return result; - } else if (method->get_name() == "pop") { - vargs.expectArgs("pop method", {1, 1}, {0, 0}); - return obj.pop(vargs.args[0]); - } else if (method->get_name() == "get") { - vargs.expectArgs("get method", {1, 2}, {0, 0}); - auto key = vargs.args[0]; - if (vargs.args.size() == 1) { - return obj.contains(key) ? obj.at(key) : Value(); - } else { - return obj.contains(key) ? obj.at(key) : vargs.args[1]; - } - } else if (obj.contains(method->get_name())) { - auto callable = obj.at(method->get_name()); - if (!callable.is_callable()) { - throw std::runtime_error("Property '" + method->get_name() + "' is not callable"); - } - return callable.call(context, vargs); - } - } else if (obj.is_string()) { - auto str = obj.get(); - if (method->get_name() == "strip") { - vargs.expectArgs("strip method", {0, 0}, {0, 0}); - return Value(strip(str)); - } else if (method->get_name() == "capitalize") { - vargs.expectArgs("capitalize method", {0, 0}, {0, 0}); - return Value(capitalize(str)); - } else if (method->get_name() == "endswith") { - vargs.expectArgs("endswith method", {1, 1}, {0, 0}); - auto suffix = vargs.args[0].get(); - return suffix.length() <= str.length() && std::equal(suffix.rbegin(), suffix.rend(), str.rbegin()); - } else if (method->get_name() == "title") { - vargs.expectArgs("title method", {0, 0}, {0, 0}); - auto res = str; - for (size_t i = 0, n = res.size(); i < n; ++i) { - if (i == 0 || std::isspace(res[i - 1])) res[i] = std::toupper(res[i]); - else res[i] = std::tolower(res[i]); - } - return res; - } - } - throw std::runtime_error("Unknown method: " + method->get_name()); - } -}; - -class CallExpr : public Expression { -public: - std::shared_ptr object; - ArgumentsExpression args; - CallExpr(const Location & location, std::shared_ptr && obj, ArgumentsExpression && a) - : Expression(location), object(std::move(obj)), args(std::move(a)) {} - Value do_evaluate(const std::shared_ptr & context) const override { - if (!object) throw std::runtime_error("CallExpr.object is null"); - auto obj = object->evaluate(context); - if (!obj.is_callable()) { - throw std::runtime_error("Object is not callable: " + obj.dump(2)); - } - auto vargs = args.evaluate(context); - return obj.call(context, vargs); - } -}; - -class FilterExpr : public Expression { - std::vector> parts; -public: - FilterExpr(const 
Location & location, std::vector> && p) - : Expression(location), parts(std::move(p)) {} - Value do_evaluate(const std::shared_ptr & context) const override { - Value result; - bool first = true; - for (const auto& part : parts) { - if (!part) throw std::runtime_error("FilterExpr.part is null"); - if (first) { - first = false; - result = part->evaluate(context); - } else { - if (auto ce = dynamic_cast(part.get())) { - auto target = ce->object->evaluate(context); - ArgumentsValue args = ce->args.evaluate(context); - args.args.insert(args.args.begin(), result); - result = target.call(context, args); - } else { - auto callable = part->evaluate(context); - ArgumentsValue args; - args.args.insert(args.args.begin(), result); - result = callable.call(context, args); - } - } - } - return result; - } - - void prepend(std::shared_ptr && e) { - parts.insert(parts.begin(), std::move(e)); - } -}; - -class Parser { -private: - using CharIterator = std::string::const_iterator; - - std::shared_ptr template_str; - CharIterator start, end, it; - Options options; - - Parser(const std::shared_ptr& template_str, const Options & options) : template_str(template_str), options(options) { - if (!template_str) throw std::runtime_error("Template string is null"); - start = it = this->template_str->begin(); - end = this->template_str->end(); - } - - bool consumeSpaces(SpaceHandling space_handling = SpaceHandling::Strip) { - if (space_handling == SpaceHandling::Strip) { - while (it != end && std::isspace(*it)) ++it; - } - return true; - } - - std::unique_ptr parseString() { - auto doParse = [&](char quote) -> std::unique_ptr { - if (it == end || *it != quote) return nullptr; - std::string result; - bool escape = false; - for (++it; it != end; ++it) { - if (escape) { - escape = false; - switch (*it) { - case 'n': result += '\n'; break; - case 'r': result += '\r'; break; - case 't': result += '\t'; break; - case 'b': result += '\b'; break; - case 'f': result += '\f'; break; - case '\\': result += '\\'; break; - default: - if (*it == quote) { - result += quote; - } else { - result += *it; - } - break; - } - } else if (*it == '\\') { - escape = true; - } else if (*it == quote) { - ++it; - return std::make_unique(std::move(result)); - } else { - result += *it; - } - } - return nullptr; - }; - - consumeSpaces(); - if (it == end) return nullptr; - if (*it == '"') return doParse('"'); - if (*it == '\'') return doParse('\''); - return nullptr; - } - - json parseNumber(CharIterator& it, const CharIterator& end) { - auto before = it; - consumeSpaces(); - auto start = it; - bool hasDecimal = false; - bool hasExponent = false; - - if (it != end && (*it == '-' || *it == '+')) ++it; - - while (it != end) { - if (std::isdigit(*it)) { - ++it; - } else if (*it == '.') { - if (hasDecimal) throw std::runtime_error("Multiple decimal points"); - hasDecimal = true; - ++it; - } else if (it != start && (*it == 'e' || *it == 'E')) { - if (hasExponent) throw std::runtime_error("Multiple exponents"); - hasExponent = true; - ++it; - } else { - break; - } - } - if (start == it) { - it = before; - return json(); // No valid characters found - } - - std::string str(start, it); - try { - return json::parse(str); - } catch (json::parse_error& e) { - throw std::runtime_error("Failed to parse number: '" + str + "' (" + std::string(e.what()) + ")"); - return json(); - } - } - - /** integer, float, bool, string */ - std::shared_ptr parseConstant() { - auto start = it; - consumeSpaces(); - if (it == end) return nullptr; - if (*it == '"' || *it == '\'') { - 
auto str = parseString(); - if (str) return std::make_shared(*str); - } - static std::regex prim_tok(R"(true\b|True\b|false\b|False\b|None\b)"); - auto token = consumeToken(prim_tok); - if (!token.empty()) { - if (token == "true" || token == "True") return std::make_shared(true); - if (token == "false" || token == "False") return std::make_shared(false); - if (token == "None") return std::make_shared(nullptr); - throw std::runtime_error("Unknown constant token: " + token); - } - - auto number = parseNumber(it, end); - if (!number.is_null()) return std::make_shared(number); - - it = start; - return nullptr; - } - - class expression_parsing_error : public std::runtime_error { - const CharIterator it; - public: - expression_parsing_error(const std::string & message, const CharIterator & it) - : std::runtime_error(message), it(it) {} - size_t get_pos(const CharIterator & begin) const { - return std::distance(begin, it); - } - }; - - bool peekSymbols(const std::vector & symbols) const { - for (const auto & symbol : symbols) { - if (std::distance(it, end) >= (int64_t) symbol.size() && std::string(it, it + symbol.size()) == symbol) { - return true; - } - } - return false; - } - - std::vector consumeTokenGroups(const std::regex & regex, SpaceHandling space_handling = SpaceHandling::Strip) { - auto start = it; - consumeSpaces(space_handling); - std::smatch match; - if (std::regex_search(it, end, match, regex) && match.position() == 0) { - it += match[0].length(); - std::vector ret; - for (size_t i = 0, n = match.size(); i < n; ++i) { - ret.push_back(match[i].str()); - } - return ret; - } - it = start; - return {}; - } - std::string consumeToken(const std::regex & regex, SpaceHandling space_handling = SpaceHandling::Strip) { - auto start = it; - consumeSpaces(space_handling); - std::smatch match; - if (std::regex_search(it, end, match, regex) && match.position() == 0) { - it += match[0].length(); - return match[0].str(); - } - it = start; - return ""; - } - - std::string consumeToken(const std::string & token, SpaceHandling space_handling = SpaceHandling::Strip) { - auto start = it; - consumeSpaces(space_handling); - if (std::distance(it, end) >= (int64_t) token.size() && std::string(it, it + token.size()) == token) { - it += token.size(); - return token; - } - it = start; - return ""; - } - - std::shared_ptr parseExpression(bool allow_if_expr = true) { - auto left = parseLogicalOr(); - if (it == end) return left; - - if (!allow_if_expr) return left; - - static std::regex if_tok(R"(if\b)"); - if (consumeToken(if_tok).empty()) { - return left; - } - - auto location = get_location(); - auto [condition, else_expr] = parseIfExpression(); - return std::make_shared(location, std::move(condition), std::move(left), std::move(else_expr)); - } - - Location get_location() const { - return {template_str, (size_t) std::distance(start, it)}; - } - - std::pair, std::shared_ptr> parseIfExpression() { - auto condition = parseLogicalOr(); - if (!condition) throw std::runtime_error("Expected condition expression"); - - static std::regex else_tok(R"(else\b)"); - std::shared_ptr else_expr; - if (!consumeToken(else_tok).empty()) { - else_expr = parseExpression(); - if (!else_expr) throw std::runtime_error("Expected 'else' expression"); - } - return std::pair(std::move(condition), std::move(else_expr)); - } - - std::shared_ptr parseLogicalOr() { - auto left = parseLogicalAnd(); - if (!left) throw std::runtime_error("Expected left side of 'logical or' expression"); - - static std::regex or_tok(R"(or\b)"); - auto location = 
get_location(); - while (!consumeToken(or_tok).empty()) { - auto right = parseLogicalAnd(); - if (!right) throw std::runtime_error("Expected right side of 'or' expression"); - left = std::make_shared(location, std::move(left), std::move(right), BinaryOpExpr::Op::Or); - } - return left; - } - - std::shared_ptr parseLogicalNot() { - static std::regex not_tok(R"(not\b)"); - auto location = get_location(); - - if (!consumeToken(not_tok).empty()) { - auto sub = parseLogicalNot(); - if (!sub) throw std::runtime_error("Expected expression after 'not' keyword"); - return std::make_shared(location, std::move(sub), UnaryOpExpr::Op::LogicalNot); - } - return parseLogicalCompare(); - } - - std::shared_ptr parseLogicalAnd() { - auto left = parseLogicalNot(); - if (!left) throw std::runtime_error("Expected left side of 'logical and' expression"); - - static std::regex and_tok(R"(and\b)"); - auto location = get_location(); - while (!consumeToken(and_tok).empty()) { - auto right = parseLogicalNot(); - if (!right) throw std::runtime_error("Expected right side of 'and' expression"); - left = std::make_shared(location, std::move(left), std::move(right), BinaryOpExpr::Op::And); - } - return left; - } - - std::shared_ptr parseLogicalCompare() { - auto left = parseStringConcat(); - if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression"); - - static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)"); - static std::regex not_tok(R"(not\b)"); - std::string op_str; - while (!(op_str = consumeToken(compare_tok)).empty()) { - auto location = get_location(); - if (op_str == "is") { - auto negated = !consumeToken(not_tok).empty(); - - auto identifier = parseIdentifier(); - if (!identifier) throw std::runtime_error("Expected identifier after 'is' keyword"); - - return std::make_shared( - left->location, - std::move(left), std::move(identifier), - negated ? 
BinaryOpExpr::Op::IsNot : BinaryOpExpr::Op::Is); - } - auto right = parseStringConcat(); - if (!right) throw std::runtime_error("Expected right side of 'logical compare' expression"); - BinaryOpExpr::Op op; - if (op_str == "==") op = BinaryOpExpr::Op::Eq; - else if (op_str == "!=") op = BinaryOpExpr::Op::Ne; - else if (op_str == "<") op = BinaryOpExpr::Op::Lt; - else if (op_str == ">") op = BinaryOpExpr::Op::Gt; - else if (op_str == "<=") op = BinaryOpExpr::Op::Le; - else if (op_str == ">=") op = BinaryOpExpr::Op::Ge; - else if (op_str == "in") op = BinaryOpExpr::Op::In; - else if (op_str.substr(0, 3) == "not") op = BinaryOpExpr::Op::NotIn; - else throw std::runtime_error("Unknown comparison operator: " + op_str); - left = std::make_shared(get_location(), std::move(left), std::move(right), op); - } - return left; - } - - Expression::Parameters parseParameters() { - consumeSpaces(); - if (consumeToken("(").empty()) throw std::runtime_error("Expected opening parenthesis in param list"); - - Expression::Parameters result; - - while (it != end) { - if (!consumeToken(")").empty()) { - return result; - } - auto expr = parseExpression(); - if (!expr) throw std::runtime_error("Expected expression in call args"); - - if (auto ident = dynamic_cast(expr.get())) { - if (!consumeToken("=").empty()) { - auto value = parseExpression(); - if (!value) throw std::runtime_error("Expected expression in for named arg"); - result.emplace_back(ident->get_name(), std::move(value)); - } else { - result.emplace_back(ident->get_name(), nullptr); - } - } else { - result.emplace_back(std::string(), std::move(expr)); - } - if (consumeToken(",").empty()) { - if (consumeToken(")").empty()) { - throw std::runtime_error("Expected closing parenthesis in call args"); - } - return result; - } - } - throw std::runtime_error("Expected closing parenthesis in call args"); - } - - ArgumentsExpression parseCallArgs() { - consumeSpaces(); - if (consumeToken("(").empty()) throw std::runtime_error("Expected opening parenthesis in call args"); - - ArgumentsExpression result; - - while (it != end) { - if (!consumeToken(")").empty()) { - return result; - } - auto expr = parseExpression(); - if (!expr) throw std::runtime_error("Expected expression in call args"); - - if (auto ident = dynamic_cast(expr.get())) { - if (!consumeToken("=").empty()) { - auto value = parseExpression(); - if (!value) throw std::runtime_error("Expected expression in for named arg"); - result.kwargs.emplace_back(ident->get_name(), std::move(value)); - } else { - result.args.emplace_back(std::move(expr)); - } - } else { - result.args.emplace_back(std::move(expr)); - } - if (consumeToken(",").empty()) { - if (consumeToken(")").empty()) { - throw std::runtime_error("Expected closing parenthesis in call args"); - } - return result; - } - } - throw std::runtime_error("Expected closing parenthesis in call args"); - } - - std::shared_ptr parseIdentifier() { - static std::regex ident_regex(R"((?!(?:not|is|and|or|del)\b)[a-zA-Z_]\w*)"); - auto location = get_location(); - auto ident = consumeToken(ident_regex); - if (ident.empty()) - return nullptr; - return std::make_shared(location, ident); - } - - std::shared_ptr parseStringConcat() { - auto left = parseMathPow(); - if (!left) throw std::runtime_error("Expected left side of 'string concat' expression"); - - static std::regex concat_tok(R"(~(?!\}))"); - if (!consumeToken(concat_tok).empty()) { - auto right = parseLogicalAnd(); - if (!right) throw std::runtime_error("Expected right side of 'string concat' expression"); - 
left = std::make_shared(get_location(), std::move(left), std::move(right), BinaryOpExpr::Op::StrConcat); - } - return left; - } - - std::shared_ptr parseMathPow() { - auto left = parseMathPlusMinus(); - if (!left) throw std::runtime_error("Expected left side of 'math pow' expression"); - - while (!consumeToken("**").empty()) { - auto right = parseMathPlusMinus(); - if (!right) throw std::runtime_error("Expected right side of 'math pow' expression"); - left = std::make_shared(get_location(), std::move(left), std::move(right), BinaryOpExpr::Op::MulMul); - } - return left; - } - - std::shared_ptr parseMathPlusMinus() { - static std::regex plus_minus_tok(R"(\+|-(?![}%#]\}))"); - - auto left = parseMathMulDiv(); - if (!left) throw std::runtime_error("Expected left side of 'math plus/minus' expression"); - std::string op_str; - while (!(op_str = consumeToken(plus_minus_tok)).empty()) { - auto right = parseMathMulDiv(); - if (!right) throw std::runtime_error("Expected right side of 'math plus/minus' expression"); - auto op = op_str == "+" ? BinaryOpExpr::Op::Add : BinaryOpExpr::Op::Sub; - left = std::make_shared(get_location(), std::move(left), std::move(right), op); - } - return left; - } - - std::shared_ptr parseMathMulDiv() { - auto left = parseMathUnaryPlusMinus(); - if (!left) throw std::runtime_error("Expected left side of 'math mul/div' expression"); - - static std::regex mul_div_tok(R"(\*\*?|//?|%(?!\}))"); - std::string op_str; - while (!(op_str = consumeToken(mul_div_tok)).empty()) { - auto right = parseMathUnaryPlusMinus(); - if (!right) throw std::runtime_error("Expected right side of 'math mul/div' expression"); - auto op = op_str == "*" ? BinaryOpExpr::Op::Mul - : op_str == "**" ? BinaryOpExpr::Op::MulMul - : op_str == "/" ? BinaryOpExpr::Op::Div - : op_str == "//" ? BinaryOpExpr::Op::DivDiv - : BinaryOpExpr::Op::Mod; - left = std::make_shared(get_location(), std::move(left), std::move(right), op); - } - - if (!consumeToken("|").empty()) { - auto expr = parseMathMulDiv(); - if (auto filter = dynamic_cast(expr.get())) { - filter->prepend(std::move(left)); - return expr; - } else { - std::vector> parts; - parts.emplace_back(std::move(left)); - parts.emplace_back(std::move(expr)); - return std::make_shared(get_location(), std::move(parts)); - } - } - return left; - } - - std::shared_ptr call_func(const std::string & name, ArgumentsExpression && args) const { - return std::make_shared(get_location(), std::make_shared(get_location(), name), std::move(args)); - } - - std::shared_ptr parseMathUnaryPlusMinus() { - static std::regex unary_plus_minus_tok(R"(\+|-(?![}%#]\}))"); - auto op_str = consumeToken(unary_plus_minus_tok); - auto expr = parseExpansion(); - if (!expr) throw std::runtime_error("Expected expr of 'unary plus/minus/expansion' expression"); - - if (!op_str.empty()) { - auto op = op_str == "+" ? UnaryOpExpr::Op::Plus : UnaryOpExpr::Op::Minus; - return std::make_shared(get_location(), std::move(expr), op); - } - return expr; - } - - std::shared_ptr parseExpansion() { - static std::regex expansion_tok(R"(\*\*?)"); - auto op_str = consumeToken(expansion_tok); - auto expr = parseValueExpression(); - if (op_str.empty()) return expr; - if (!expr) throw std::runtime_error("Expected expr of 'expansion' expression"); - return std::make_shared(get_location(), std::move(expr), op_str == "*" ? 
UnaryOpExpr::Op::Expansion : UnaryOpExpr::Op::ExpansionDict); - } - - std::shared_ptr parseValueExpression() { - auto parseValue = [&]() -> std::shared_ptr { - auto location = get_location(); - auto constant = parseConstant(); - if (constant) return std::make_shared(location, *constant); - - static std::regex null_regex(R"(null\b)"); - if (!consumeToken(null_regex).empty()) return std::make_shared(location, Value()); - - auto identifier = parseIdentifier(); - if (identifier) return identifier; - - auto braced = parseBracedExpressionOrArray(); - if (braced) return braced; - - auto array = parseArray(); - if (array) return array; - - auto dictionary = parseDictionary(); - if (dictionary) return dictionary; - - throw std::runtime_error("Expected value expression"); - }; - - auto value = parseValue(); - - while (it != end && consumeSpaces() && peekSymbols({ "[", "." })) { - if (!consumeToken("[").empty()) { - std::shared_ptr index; - if (!consumeToken(":").empty()) { - auto slice_end = parseExpression(); - index = std::make_shared(slice_end->location, nullptr, std::move(slice_end)); - } else { - auto slice_start = parseExpression(); - if (!consumeToken(":").empty()) { - consumeSpaces(); - if (peekSymbols({ "]" })) { - index = std::make_shared(slice_start->location, std::move(slice_start), nullptr); - } else { - auto slice_end = parseExpression(); - index = std::make_shared(slice_start->location, std::move(slice_start), std::move(slice_end)); - } - } else { - index = std::move(slice_start); - } - } - if (!index) throw std::runtime_error("Empty index in subscript"); - if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript"); - - value = std::make_shared(value->location, std::move(value), std::move(index)); - } else if (!consumeToken(".").empty()) { - auto identifier = parseIdentifier(); - if (!identifier) throw std::runtime_error("Expected identifier in subscript"); - - consumeSpaces(); - if (peekSymbols({ "(" })) { - auto callParams = parseCallArgs(); - value = std::make_shared(identifier->location, std::move(value), std::move(identifier), std::move(callParams)); - } else { - auto key = std::make_shared(identifier->location, Value(identifier->get_name())); - value = std::make_shared(identifier->location, std::move(value), std::move(key)); - } - } - consumeSpaces(); - } - - if (peekSymbols({ "(" })) { - auto location = get_location(); - auto callParams = parseCallArgs(); - value = std::make_shared(location, std::move(value), std::move(callParams)); - } - return value; - } - - std::shared_ptr parseBracedExpressionOrArray() { - if (consumeToken("(").empty()) return nullptr; - - auto expr = parseExpression(); - if (!expr) throw std::runtime_error("Expected expression in braced expression"); - - if (!consumeToken(")").empty()) { - return expr; // Drop the parentheses - } - - std::vector> tuple; - tuple.emplace_back(std::move(expr)); - - while (it != end) { - if (consumeToken(",").empty()) throw std::runtime_error("Expected comma in tuple"); - auto next = parseExpression(); - if (!next) throw std::runtime_error("Expected expression in tuple"); - tuple.push_back(std::move(next)); - - if (!consumeToken(")").empty()) { - return std::make_shared(get_location(), std::move(tuple)); - } - } - throw std::runtime_error("Expected closing parenthesis"); - } - - std::shared_ptr parseArray() { - if (consumeToken("[").empty()) return nullptr; - - std::vector> elements; - if (!consumeToken("]").empty()) { - return std::make_shared(get_location(), std::move(elements)); - } - 
auto first_expr = parseExpression(); - if (!first_expr) throw std::runtime_error("Expected first expression in array"); - elements.push_back(std::move(first_expr)); - - while (it != end) { - if (!consumeToken(",").empty()) { - auto expr = parseExpression(); - if (!expr) throw std::runtime_error("Expected expression in array"); - elements.push_back(std::move(expr)); - } else if (!consumeToken("]").empty()) { - return std::make_shared(get_location(), std::move(elements)); - } else { - throw std::runtime_error("Expected comma or closing bracket in array"); - } - } - throw std::runtime_error("Expected closing bracket"); - } - - std::shared_ptr parseDictionary() { - if (consumeToken("{").empty()) return nullptr; - - std::vector, std::shared_ptr>> elements; - if (!consumeToken("}").empty()) { - return std::make_shared(get_location(), std::move(elements)); - } - - auto parseKeyValuePair = [&]() { - auto key = parseExpression(); - if (!key) throw std::runtime_error("Expected key in dictionary"); - if (consumeToken(":").empty()) throw std::runtime_error("Expected colon betweek key & value in dictionary"); - auto value = parseExpression(); - if (!value) throw std::runtime_error("Expected value in dictionary"); - elements.emplace_back(std::pair(std::move(key), std::move(value))); - }; - - parseKeyValuePair(); - - while (it != end) { - if (!consumeToken(",").empty()) { - parseKeyValuePair(); - } else if (!consumeToken("}").empty()) { - return std::make_shared(get_location(), std::move(elements)); - } else { - throw std::runtime_error("Expected comma or closing brace in dictionary"); - } - } - throw std::runtime_error("Expected closing brace"); - } - - SpaceHandling parsePreSpace(const std::string& s) const { - if (s == "-") - return SpaceHandling::Strip; - return SpaceHandling::Keep; - } - - SpaceHandling parsePostSpace(const std::string& s) const { - if (s == "-") return SpaceHandling::Strip; - return SpaceHandling::Keep; - } - - using TemplateTokenVector = std::vector>; - using TemplateTokenIterator = TemplateTokenVector::const_iterator; - - std::vector parseVarNames() { - static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)"); - - std::vector group; - if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names"); - std::vector varnames; - std::istringstream iss(group[1]); - std::string varname; - while (std::getline(iss, varname, ',')) { - varnames.push_back(strip(varname)); - } - return varnames; - } - - std::runtime_error unexpected(const TemplateToken & token) const { - return std::runtime_error("Unexpected " + TemplateToken::typeToString(token.type) - + error_location_suffix(*template_str, token.location.pos)); - } - std::runtime_error unterminated(const TemplateToken & token) const { - return std::runtime_error("Unterminated " + TemplateToken::typeToString(token.type) - + error_location_suffix(*template_str, token.location.pos)); - } - - TemplateTokenVector tokenize() { - static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})"); - static std::regex expr_open_regex(R"(\{\{([-~])?)"); - static std::regex block_open_regex(R"(^\{%([-~])?\s*)"); - static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)"); - static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)"); - static std::regex expr_close_regex(R"(\s*([-~])?\}\})"); - static std::regex block_close_regex(R"(\s*([-~])?%\})"); - - TemplateTokenVector tokens; 
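// Illustrative note (editorial, not part of the original minja.hpp): the regexes above drive the
// tokenizer loop below. comment_tok matches whole {# ... #} comments, expr_open/expr_close_regex
// bracket {{ ... }} expressions, block_open/block_close_regex bracket {% ... %} blocks whose first
// word must match block_keyword_tok (if/elif/else/endif, for/endfor, set/endset, macro/endmacro,
// filter/endfilter, generation/endgeneration, break, continue), and non_text_open_regex locates the
// next opener so everything before it is emitted as a plain text token. The optional [-~] capture
// groups feed parsePreSpace/parsePostSpace to implement Jinja whitespace control ("-" strips).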
- std::vector group; - std::string text; - std::smatch match; - - try { - while (it != end) { - auto location = get_location(); - - if (!(group = consumeTokenGroups(comment_tok, SpaceHandling::Keep)).empty()) { - auto pre_space = parsePreSpace(group[1]); - auto content = group[2]; - auto post_space = parsePostSpace(group[3]); - tokens.push_back(std::make_unique(location, pre_space, post_space, content)); - } else if (!(group = consumeTokenGroups(expr_open_regex, SpaceHandling::Keep)).empty()) { - auto pre_space = parsePreSpace(group[1]); - auto expr = parseExpression(); - - if ((group = consumeTokenGroups(expr_close_regex)).empty()) { - throw std::runtime_error("Expected closing expression tag"); - } - - auto post_space = parsePostSpace(group[1]); - tokens.push_back(std::make_unique(location, pre_space, post_space, std::move(expr))); - } else if (!(group = consumeTokenGroups(block_open_regex, SpaceHandling::Keep)).empty()) { - auto pre_space = parsePreSpace(group[1]); - - std::string keyword; - - auto parseBlockClose = [&]() -> SpaceHandling { - if ((group = consumeTokenGroups(block_close_regex)).empty()) throw std::runtime_error("Expected closing block tag"); - return parsePostSpace(group[1]); - }; - - if ((keyword = consumeToken(block_keyword_tok)).empty()) throw std::runtime_error("Expected block keyword"); - - if (keyword == "if") { - auto condition = parseExpression(); - if (!condition) throw std::runtime_error("Expected condition in if block"); - - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space, std::move(condition))); - } else if (keyword == "elif") { - auto condition = parseExpression(); - if (!condition) throw std::runtime_error("Expected condition in elif block"); - - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space, std::move(condition))); - } else if (keyword == "else") { - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space)); - } else if (keyword == "endif") { - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space)); - } else if (keyword == "for") { - static std::regex recursive_tok(R"(recursive\b)"); - static std::regex if_tok(R"(if\b)"); - - auto varnames = parseVarNames(); - static std::regex in_tok(R"(in\b)"); - if (consumeToken(in_tok).empty()) throw std::runtime_error("Expected 'in' keyword in for block"); - auto iterable = parseExpression(/* allow_if_expr = */ false); - if (!iterable) throw std::runtime_error("Expected iterable in for block"); - - std::shared_ptr condition; - if (!consumeToken(if_tok).empty()) { - condition = parseExpression(); - } - auto recursive = !consumeToken(recursive_tok).empty(); - - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space, std::move(varnames), std::move(iterable), std::move(condition), recursive)); - } else if (keyword == "endfor") { - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space)); - } else if (keyword == "generation") { - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space)); - } else if (keyword == "endgeneration") { - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space)); - } else if (keyword == "set") { - static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))"); - - std::string ns; - 
std::vector var_names; - std::shared_ptr value; - if (!(group = consumeTokenGroups(namespaced_var_regex)).empty()) { - ns = group[1]; - var_names.push_back(group[2]); - - if (consumeToken("=").empty()) throw std::runtime_error("Expected equals sign in set block"); - - value = parseExpression(); - if (!value) throw std::runtime_error("Expected value in set block"); - } else { - var_names = parseVarNames(); - - if (!consumeToken("=").empty()) { - value = parseExpression(); - if (!value) throw std::runtime_error("Expected value in set block"); - } - } - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space, ns, var_names, std::move(value))); - } else if (keyword == "endset") { - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space)); - } else if (keyword == "macro") { - auto macroname = parseIdentifier(); - if (!macroname) throw std::runtime_error("Expected macro name in macro block"); - auto params = parseParameters(); - - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space, std::move(macroname), std::move(params))); - } else if (keyword == "endmacro") { - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space)); - } else if (keyword == "filter") { - auto filter = parseExpression(); - if (!filter) throw std::runtime_error("Expected expression in filter block"); - - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space, std::move(filter))); - } else if (keyword == "endfilter") { - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space)); - } else if (keyword == "break" || keyword == "continue") { - auto post_space = parseBlockClose(); - tokens.push_back(std::make_unique(location, pre_space, post_space, keyword == "break" ? 
LoopControlType::Break : LoopControlType::Continue)); - } else { - throw std::runtime_error("Unexpected block: " + keyword); - } - } else if (std::regex_search(it, end, match, non_text_open_regex)) { - if (!match.position()) { - if (match[0] != "{#") - throw std::runtime_error("Internal error: Expected a comment"); - throw std::runtime_error("Missing end of comment tag"); - } - auto text_end = it + match.position(); - text = std::string(it, text_end); - it = text_end; - tokens.push_back(std::make_unique(location, SpaceHandling::Keep, SpaceHandling::Keep, text)); - } else { - text = std::string(it, end); - it = end; - tokens.push_back(std::make_unique(location, SpaceHandling::Keep, SpaceHandling::Keep, text)); - } - } - return tokens; - } catch (const std::exception & e) { - throw std::runtime_error(e.what() + error_location_suffix(*template_str, std::distance(start, it))); - } - } - - std::shared_ptr parseTemplate( - const TemplateTokenIterator & begin, - TemplateTokenIterator & it, - const TemplateTokenIterator & end, - bool fully = false) const { - std::vector> children; - while (it != end) { - const auto start = it; - const auto & token = *(it++); - if (auto if_token = dynamic_cast(token.get())) { - std::vector, std::shared_ptr>> cascade; - cascade.emplace_back(std::move(if_token->condition), parseTemplate(begin, it, end)); - - while (it != end && (*it)->type == TemplateToken::Type::Elif) { - auto elif_token = dynamic_cast((*(it++)).get()); - cascade.emplace_back(std::move(elif_token->condition), parseTemplate(begin, it, end)); - } - - if (it != end && (*it)->type == TemplateToken::Type::Else) { - cascade.emplace_back(nullptr, parseTemplate(begin, ++it, end)); - } - if (it == end || (*(it++))->type != TemplateToken::Type::EndIf) { - throw unterminated(**start); - } - children.emplace_back(std::make_shared(token->location, std::move(cascade))); - } else if (auto for_token = dynamic_cast(token.get())) { - auto body = parseTemplate(begin, it, end); - auto else_body = std::shared_ptr(); - if (it != end && (*it)->type == TemplateToken::Type::Else) { - else_body = parseTemplate(begin, ++it, end); - } - if (it == end || (*(it++))->type != TemplateToken::Type::EndFor) { - throw unterminated(**start); - } - children.emplace_back(std::make_shared(token->location, std::move(for_token->var_names), std::move(for_token->iterable), std::move(for_token->condition), std::move(body), for_token->recursive, std::move(else_body))); - } else if (dynamic_cast(token.get())) { - auto body = parseTemplate(begin, it, end); - if (it == end || (*(it++))->type != TemplateToken::Type::EndGeneration) { - throw unterminated(**start); - } - // Treat as a no-op, as our scope is templates for inference, not training (`{% generation %}` wraps generated tokens for masking). - children.emplace_back(std::move(body)); - } else if (auto text_token = dynamic_cast(token.get())) { - SpaceHandling pre_space = (it - 1) != begin ? (*(it - 2))->post_space : SpaceHandling::Keep; - SpaceHandling post_space = it != end ? 
(*it)->pre_space : SpaceHandling::Keep; - - auto text = text_token->text; - if (post_space == SpaceHandling::Strip) { - static std::regex trailing_space_regex(R"(\s+$)"); - text = std::regex_replace(text, trailing_space_regex, ""); - } else if (options.lstrip_blocks && it != end) { - auto i = text.size(); - while (i > 0 && (text[i - 1] == ' ' || text[i - 1] == '\t')) i--; - if ((i == 0 && (it - 1) == begin) || (i > 0 && text[i - 1] == '\n')) { - text.resize(i); - } - } - if (pre_space == SpaceHandling::Strip) { - static std::regex leading_space_regex(R"(^\s+)"); - text = std::regex_replace(text, leading_space_regex, ""); - } else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast((*(it - 2)).get())) { - if (text.length() > 0 && text[0] == '\n') { - text.erase(0, 1); - } - } - if (it == end && !options.keep_trailing_newline) { - auto i = text.size(); - if (i > 0 && text[i - 1] == '\n') { - i--; - if (i > 0 && text[i - 1] == '\r') i--; - text.resize(i); - } - } - children.emplace_back(std::make_shared(token->location, text)); - } else if (auto expr_token = dynamic_cast(token.get())) { - children.emplace_back(std::make_shared(token->location, std::move(expr_token->expr))); - } else if (auto set_token = dynamic_cast(token.get())) { - if (set_token->value) { - children.emplace_back(std::make_shared(token->location, set_token->ns, set_token->var_names, std::move(set_token->value))); - } else { - auto value_template = parseTemplate(begin, it, end); - if (it == end || (*(it++))->type != TemplateToken::Type::EndSet) { - throw unterminated(**start); - } - if (!set_token->ns.empty()) throw std::runtime_error("Namespaced set not supported in set with template value"); - if (set_token->var_names.size() != 1) throw std::runtime_error("Structural assignment not supported in set with template value"); - auto & name = set_token->var_names[0]; - children.emplace_back(std::make_shared(token->location, name, std::move(value_template))); - } - } else if (auto macro_token = dynamic_cast(token.get())) { - auto body = parseTemplate(begin, it, end); - if (it == end || (*(it++))->type != TemplateToken::Type::EndMacro) { - throw unterminated(**start); - } - children.emplace_back(std::make_shared(token->location, std::move(macro_token->name), std::move(macro_token->params), std::move(body))); - } else if (auto filter_token = dynamic_cast(token.get())) { - auto body = parseTemplate(begin, it, end); - if (it == end || (*(it++))->type != TemplateToken::Type::EndFilter) { - throw unterminated(**start); - } - children.emplace_back(std::make_shared(token->location, std::move(filter_token->filter), std::move(body))); - } else if (dynamic_cast(token.get())) { - // Ignore comments - } else if (auto ctrl_token = dynamic_cast(token.get())) { - children.emplace_back(std::make_shared(token->location, ctrl_token->control_type)); - } else if (dynamic_cast(token.get()) - || dynamic_cast(token.get()) - || dynamic_cast(token.get()) - || dynamic_cast(token.get()) - || dynamic_cast(token.get()) - || dynamic_cast(token.get()) - || dynamic_cast(token.get()) - || dynamic_cast(token.get())) { - it--; // unconsume the token - break; // exit the loop - } else { - throw unexpected(**(it-1)); - } - } - if (fully && it != end) { - throw unexpected(**it); - } - if (children.empty()) { - return std::make_shared(Location { template_str, 0 }, std::string()); - } else if (children.size() == 1) { - return std::move(children[0]); - } else { - return std::make_shared(children[0]->location(), std::move(children)); - } - } - -public: - - static 
std::shared_ptr parse(const std::string& template_str, const Options & options) { - Parser parser(std::make_shared(normalize_newlines(template_str)), options); - auto tokens = parser.tokenize(); - TemplateTokenIterator begin = tokens.begin(); - auto it = begin; - TemplateTokenIterator end = tokens.end(); - return parser.parseTemplate(begin, it, end, /* full= */ true); - } -}; - -static Value simple_function(const std::string & fn_name, const std::vector & params, const std::function &, Value & args)> & fn) { - std::map named_positions; - for (size_t i = 0, n = params.size(); i < n; i++) named_positions[params[i]] = i; - - return Value::callable([=](const std::shared_ptr & context, ArgumentsValue & args) -> Value { - auto args_obj = Value::object(); - std::vector provided_args(params.size()); - for (size_t i = 0, n = args.args.size(); i < n; i++) { - auto & arg = args.args[i]; - if (i < params.size()) { - args_obj.set(params[i], arg); - provided_args[i] = true; - } else { - throw std::runtime_error("Too many positional params for " + fn_name); - } - } - for (auto & [name, value] : args.kwargs) { - auto named_pos_it = named_positions.find(name); - if (named_pos_it == named_positions.end()) { - throw std::runtime_error("Unknown argument " + name + " for function " + fn_name); - } - provided_args[named_pos_it->second] = true; - args_obj.set(name, value); - } - return fn(context, args_obj); - }); -} - -inline std::shared_ptr Context::builtins() { - auto globals = Value::object(); - - globals.set("raise_exception", simple_function("raise_exception", { "message" }, [](const std::shared_ptr &, Value & args) -> Value { - throw std::runtime_error(args.at("message").get()); - })); - globals.set("tojson", simple_function("tojson", { "value", "indent" }, [](const std::shared_ptr &, Value & args) { - return Value(args.at("value").dump(args.get("indent", -1), /* tojson= */ true)); - })); - globals.set("items", simple_function("items", { "object" }, [](const std::shared_ptr &, Value & args) { - auto items = Value::array(); - if (args.contains("object")) { - auto & obj = args.at("object"); - if (obj.is_string()) { - auto json_obj = json::parse(obj.get()); - for (const auto & kv : json_obj.items()) { - items.push_back(Value::array({kv.key(), kv.value()})); - } - } else if (!obj.is_null()) { - for (auto & key : obj.keys()) { - items.push_back(Value::array({key, obj.at(key)})); - } - } - } - return items; - })); - globals.set("last", simple_function("last", { "items" }, [](const std::shared_ptr &, Value & args) { - auto items = args.at("items"); - if (!items.is_array()) throw std::runtime_error("object is not a list"); - if (items.size() == 0) return Value(); - return items.at(items.size() - 1); - })); - globals.set("trim", simple_function("trim", { "text" }, [](const std::shared_ptr &, Value & args) { - auto & text = args.at("text"); - return text.is_null() ? 
text : Value(strip(text.get())); - })); - globals.set("lower", simple_function("lower", { "text" }, [](const std::shared_ptr &, Value & args) { - auto text = args.at("text"); - if (text.is_null()) return text; - std::string res; - auto str = text.get(); - std::transform(str.begin(), str.end(), std::back_inserter(res), ::tolower); - return Value(res); - })); - globals.set("default", Value::callable([=](const std::shared_ptr &, ArgumentsValue & args) { - args.expectArgs("default", {2, 3}, {0, 1}); - auto & value = args.args[0]; - auto & default_value = args.args[1]; - bool boolean = false; - if (args.args.size() == 3) { - boolean = args.args[2].get(); - } else { - Value bv = args.get_named("boolean"); - if (!bv.is_null()) { - boolean = bv.get(); - } - } - return boolean ? (value.to_bool() ? value : default_value) : value.is_null() ? default_value : value; - })); - auto escape = simple_function("escape", { "text" }, [](const std::shared_ptr &, Value & args) { - return Value(html_escape(args.at("text").get())); - }); - globals.set("e", escape); - globals.set("escape", escape); - globals.set("joiner", simple_function("joiner", { "sep" }, [](const std::shared_ptr &, Value & args) { - auto sep = args.get("sep", ""); - auto first = std::make_shared(true); - return simple_function("", {}, [sep, first](const std::shared_ptr &, const Value &) -> Value { - if (*first) { - *first = false; - return ""; - } - return sep; - }); - return Value(html_escape(args.at("text").get())); - })); - globals.set("count", simple_function("count", { "items" }, [](const std::shared_ptr &, Value & args) { - return Value((int64_t) args.at("items").size()); - })); - globals.set("dictsort", simple_function("dictsort", { "value" }, [](const std::shared_ptr &, Value & args) { - if (args.size() != 1) throw std::runtime_error("dictsort expects exactly 1 argument (TODO: fix implementation)"); - auto & value = args.at("value"); - auto keys = value.keys(); - std::sort(keys.begin(), keys.end()); - auto res = Value::array(); - for (auto & key : keys) { - res.push_back(Value::array({key, value.at(key)})); - } - return res; - })); - globals.set("join", simple_function("join", { "items", "d" }, [](const std::shared_ptr &, Value & args) { - auto do_join = [](Value & items, const std::string & sep) { - if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump()); - std::ostringstream oss; - auto first = true; - for (size_t i = 0, n = items.size(); i < n; ++i) { - if (first) first = false; - else oss << sep; - oss << items.at(i).to_str(); - } - return Value(oss.str()); - }; - auto sep = args.get("d", ""); - if (args.contains("items")) { - auto & items = args.at("items"); - return do_join(items, sep); - } else { - return simple_function("", {"items"}, [sep, do_join](const std::shared_ptr &, Value & args) { - auto & items = args.at("items"); - if (!items.to_bool() || !items.is_array()) throw std::runtime_error("join expects an array for items, got: " + items.dump()); - return do_join(items, sep); - }); - } - })); - globals.set("namespace", Value::callable([=](const std::shared_ptr &, ArgumentsValue & args) { - auto ns = Value::object(); - args.expectArgs("namespace", {0, 0}, {0, (std::numeric_limits::max)()}); - for (auto & [name, value] : args.kwargs) { - ns.set(name, value); - } - return ns; - })); - auto equalto = simple_function("equalto", { "expected", "actual" }, [](const std::shared_ptr &, Value & args) -> Value { - return args.at("actual") == args.at("expected"); - }); - globals.set("equalto", equalto); 
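// Illustrative sketch (editorial, not part of the original file or of this patch): simple_function
// binds positional arguments to the declared parameter names, validates keyword arguments against
// that list, and hands the callback a single args object keyed by parameter name. Assuming the
// surrounding minja Value, Context and simple_function definitions, a hypothetical "repeat" builtin
// could be registered like the helpers above (the name and behaviour are invented for illustration):
auto repeat = simple_function("repeat", { "text", "n" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
    // "text" and "n" may be passed positionally or by name; either way they are looked up by parameter name here
    auto text = args.at("text").get<std::string>();
    auto n    = args.at("n").get<int64_t>();
    std::string out;
    for (int64_t i = 0; i < n; i++) {
        out += text;
    }
    return Value(out);
});
globals.set("repeat", repeat);  // usable in a template as {{ repeat("ab", n=3) }} -> "ababab"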
- globals.set("==", equalto); - globals.set("length", simple_function("length", { "items" }, [](const std::shared_ptr &, Value & args) -> Value { - auto & items = args.at("items"); - return (int64_t) items.size(); - })); - globals.set("safe", simple_function("safe", { "value" }, [](const std::shared_ptr &, Value & args) -> Value { - return args.at("value").to_str(); - })); - globals.set("string", simple_function("string", { "value" }, [](const std::shared_ptr &, Value & args) -> Value { - return args.at("value").to_str(); - })); - globals.set("int", simple_function("int", { "value" }, [](const std::shared_ptr &, Value & args) -> Value { - return args.at("value").to_int(); - })); - globals.set("list", simple_function("list", { "items" }, [](const std::shared_ptr &, Value & args) -> Value { - auto & items = args.at("items"); - if (!items.is_array()) throw std::runtime_error("object is not iterable"); - return items; - })); - globals.set("unique", simple_function("unique", { "items" }, [](const std::shared_ptr &, Value & args) -> Value { - auto & items = args.at("items"); - if (!items.is_array()) throw std::runtime_error("object is not iterable"); - std::unordered_set seen; - auto result = Value::array(); - for (size_t i = 0, n = items.size(); i < n; i++) { - auto pair = seen.insert(items.at(i)); - if (pair.second) { - result.push_back(items.at(i)); - } - } - return result; - })); - auto make_filter = [](const Value & filter, Value & extra_args) -> Value { - return simple_function("", { "value" }, [=](const std::shared_ptr & context, Value & args) { - auto & value = args.at("value"); - ArgumentsValue actual_args; - actual_args.args.emplace_back(value); - for (size_t i = 0, n = extra_args.size(); i < n; i++) { - actual_args.args.emplace_back(extra_args.at(i)); - } - return filter.call(context, actual_args); - }); - }; - auto select_or_reject = [make_filter](bool is_select) { - return Value::callable([=](const std::shared_ptr & context, ArgumentsValue & args) { - args.expectArgs(is_select ? "select" : "reject", {2, (std::numeric_limits::max)()}, {0, 0}); - auto & items = args.args[0]; - if (items.is_null()) - return Value::array(); - if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump()); - - auto filter_fn = context->get(args.args[1]); - if (filter_fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump()); - - auto filter_args = Value::array(); - for (size_t i = 2, n = args.args.size(); i < n; i++) { - filter_args.push_back(args.args[i]); - } - auto filter = make_filter(filter_fn, filter_args); - - auto res = Value::array(); - for (size_t i = 0, n = items.size(); i < n; i++) { - auto & item = items.at(i); - ArgumentsValue filter_args; - filter_args.args.emplace_back(item); - auto pred_res = filter.call(context, filter_args); - if (pred_res.to_bool() == (is_select ? 
true : false)) { - res.push_back(item); - } - } - return res; - }); - }; - globals.set("select", select_or_reject(/* is_select= */ true)); - globals.set("reject", select_or_reject(/* is_select= */ false)); - globals.set("map", Value::callable([=](const std::shared_ptr & context, ArgumentsValue & args) { - auto res = Value::array(); - if (args.args.size() == 1 && - ((args.has_named("attribute") && args.kwargs.size() == 1) || (args.has_named("default") && args.kwargs.size() == 2))) { - auto & items = args.args[0]; - auto attr_name = args.get_named("attribute"); - auto default_value = args.get_named("default"); - for (size_t i = 0, n = items.size(); i < n; i++) { - auto & item = items.at(i); - auto attr = item.get(attr_name); - res.push_back(attr.is_null() ? default_value : attr); - } - } else if (args.kwargs.empty() && args.args.size() >= 2) { - auto fn = context->get(args.args[1]); - if (fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump()); - ArgumentsValue filter_args { {Value()}, {} }; - for (size_t i = 2, n = args.args.size(); i < n; i++) { - filter_args.args.emplace_back(args.args[i]); - } - for (size_t i = 0, n = args.args[0].size(); i < n; i++) { - auto & item = args.args[0].at(i); - filter_args.args[0] = item; - res.push_back(fn.call(context, filter_args)); - } - } else { - throw std::runtime_error("Invalid or unsupported arguments for map"); - } - return res; - })); - globals.set("indent", simple_function("indent", { "text", "indent", "first" }, [](const std::shared_ptr &, Value & args) { - auto text = args.at("text").get(); - auto first = args.get("first", false); - std::string out; - std::string indent(args.get("indent", 0), ' '); - std::istringstream iss(text); - std::string line; - auto is_first = true; - while (std::getline(iss, line, '\n')) { - auto needs_indent = !is_first || first; - if (is_first) is_first = false; - else out += "\n"; - if (needs_indent) out += indent; - out += line; - } - if (!text.empty() && text.back() == '\n') out += "\n"; - return out; - })); - auto select_or_reject_attr = [](bool is_select) { - return Value::callable([=](const std::shared_ptr & context, ArgumentsValue & args) { - args.expectArgs(is_select ? "selectattr" : "rejectattr", {2, (std::numeric_limits::max)()}, {0, 0}); - auto & items = args.args[0]; - if (items.is_null()) - return Value::array(); - if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump()); - auto attr_name = args.args[1].get(); - - bool has_test = false; - Value test_fn; - ArgumentsValue test_args {{Value()}, {}}; - if (args.args.size() >= 3) { - has_test = true; - test_fn = context->get(args.args[2]); - if (test_fn.is_null()) throw std::runtime_error("Undefined test: " + args.args[2].dump()); - for (size_t i = 3, n = args.args.size(); i < n; i++) { - test_args.args.emplace_back(args.args[i]); - } - test_args.kwargs = args.kwargs; - } - - auto res = Value::array(); - for (size_t i = 0, n = items.size(); i < n; i++) { - auto & item = items.at(i); - auto attr = item.get(attr_name); - if (has_test) { - test_args.args[0] = attr; - if (test_fn.call(context, test_args).to_bool() == (is_select ? 
true : false)) { - res.push_back(item); - } - } else { - res.push_back(attr); - } - } - return res; - }); - }; - globals.set("selectattr", select_or_reject_attr(/* is_select= */ true)); - globals.set("rejectattr", select_or_reject_attr(/* is_select= */ false)); - globals.set("range", Value::callable([=](const std::shared_ptr &, ArgumentsValue & args) { - std::vector startEndStep(3); - std::vector param_set(3); - if (args.args.size() == 1) { - startEndStep[1] = args.args[0].get(); - param_set[1] = true; - } else { - for (size_t i = 0; i < args.args.size(); i++) { - auto & arg = args.args[i]; - auto v = arg.get(); - startEndStep[i] = v; - param_set[i] = true; - } - } - for (auto & [name, value] : args.kwargs) { - size_t i; - if (name == "start") i = 0; - else if (name == "end") i = 1; - else if (name == "step") i = 2; - else throw std::runtime_error("Unknown argument " + name + " for function range"); - - if (param_set[i]) { - throw std::runtime_error("Duplicate argument " + name + " for function range"); - } - startEndStep[i] = value.get(); - param_set[i] = true; - } - if (!param_set[1]) { - throw std::runtime_error("Missing required argument 'end' for function range"); - } - int64_t start = param_set[0] ? startEndStep[0] : 0; - int64_t end = startEndStep[1]; - int64_t step = param_set[2] ? startEndStep[2] : 1; - - auto res = Value::array(); - if (step > 0) { - for (int64_t i = start; i < end; i += step) { - res.push_back(Value(i)); - } - } else { - for (int64_t i = start; i > end; i += step) { - res.push_back(Value(i)); - } - } - return res; - })); - - return std::make_shared(std::move(globals)); -} - -inline std::shared_ptr Context::make(Value && values, const std::shared_ptr & parent) { - return std::make_shared(values.is_null() ? Value::object() : std::move(values), parent); -} - -} // namespace minja diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index a057ae45f..3ca112ef1 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -2,13 +2,10 @@ #include "common.h" #include "log.h" -#include #include -#include #include -#include -void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, +void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp, int nnew, bool print_progress) { const int64_t t_start_ms = ggml_time_ms(); const int64_t inp_size = inp.size(); @@ -20,16 +17,16 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, const int64_t i_start = std::max(inp_size - nnew, ngram_size); for (int64_t i = i_start; i < inp_size; ++i) { const int64_t ngram_start = i - ngram_size; - common_ngram ngram(&inp[ngram_start], ngram_size); + llama_ngram ngram(&inp[ngram_start], ngram_size); const llama_token token = inp[i]; - common_ngram_cache::iterator part_it = ngram_cache.find(ngram); + llama_ngram_cache::iterator part_it = ngram_cache.find(ngram); if (part_it == ngram_cache.end()) { - common_ngram_cache_part part; + llama_ngram_cache_part part; part.emplace(token, 1); ngram_cache.emplace(ngram, part); } else { - common_ngram_cache_part::iterator token_count_it = part_it->second.find(token); + llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token); if (token_count_it == part_it->second.end()) { part_it->second.emplace(token, 1); } else { @@ -62,16 +59,16 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2}; constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; // Helper function that 
tries to draft a token from only the static ngram cache: -static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { - common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); +static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) { + llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); if (part_static_it == nc_static.end()) { - return LLAMA_TOKEN_NULL; + return -1; } - const common_ngram_cache_part part_static = part_static_it->second; + const llama_ngram_cache_part part_static = part_static_it->second; int max_count_static = 0; int sum_count_static = 0; - llama_token max_token = LLAMA_TOKEN_NULL; + llama_token max_token = -1; for (std::pair token_count_static : part_static) { const llama_token token = token_count_static.first; @@ -85,39 +82,39 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram } if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { - return LLAMA_TOKEN_NULL; + return -1; } if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { - return LLAMA_TOKEN_NULL; + return -1; } return max_token; } // Try to draft a token from primary cache (context/dynamic), validate with static cache: static llama_token try_draft( - common_ngram_cache & nc_primary, const std::vector & ngrams_primary, common_ngram_cache_part & part_static, + llama_ngram_cache & nc_primary, const std::vector & ngrams_primary, llama_ngram_cache_part & part_static, const int * min_sample_size, const int * min_percent) { - llama_token drafted_token = LLAMA_TOKEN_NULL; + llama_token drafted_token = -1; - for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) { - const common_ngram ngram_primary = ngrams_primary[i]; + for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { + const llama_ngram ngram_primary = ngrams_primary[i]; - common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); + llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); if (part_primary_it == nc_primary.end()) { continue; } - const common_ngram_cache_part part_primary = part_primary_it->second; + const llama_ngram_cache_part part_primary = part_primary_it->second; int max_count_primary = 0; int max_count_static = 0; int sum_count_primary = 0; - llama_token max_token = LLAMA_TOKEN_NULL; + llama_token max_token = -1; for (std::pair token_count_primary : part_primary) { const llama_token token = token_count_primary.first; - common_ngram_cache_part::iterator token_count_static_it = part_static.find(token); + llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token); const int32_t count_primary = token_count_primary.second; const int32_t count_static = token_count_static_it != part_static.end() ? 
100*token_count_static_it->second : 1; @@ -142,9 +139,9 @@ static llama_token try_draft( return drafted_token; } -void common_ngram_cache_draft( +void llama_ngram_cache_draft( std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, - common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static + llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static ) { GGML_ASSERT(draft.size() == 1); const int inp_size = inp.size(); @@ -154,40 +151,40 @@ void common_ngram_cache_draft( } while ((int) draft.size()-1 < n_draft) { - llama_token drafted_token = LLAMA_TOKEN_NULL; + llama_token drafted_token = -1; const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; - common_ngram ngram_static; + llama_ngram ngram_static; for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) { ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j); } - common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); - common_ngram_cache_part part_static; + llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); + llama_ngram_cache_part part_static; if (part_static_it != nc_static.end()) { part_static = part_static_it->second; } // cd = context + dynamic - std::vector ngrams_cd; + std::vector ngrams_cd; for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) { const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1; - common_ngram ngram_cd; + llama_ngram ngram_cd; for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) { ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j); } ngrams_cd.push_back(ngram_cd); } - if (drafted_token == LLAMA_TOKEN_NULL) { + if (drafted_token == -1) { drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax); } - if (drafted_token == LLAMA_TOKEN_NULL) { + if (drafted_token == -1) { drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict); } - if (drafted_token == LLAMA_TOKEN_NULL) { + if (drafted_token == -1) { drafted_token = try_draft(nc_static, ngram_static); } - if (drafted_token == LLAMA_TOKEN_NULL) { + if (drafted_token == -1) { break; } @@ -196,16 +193,16 @@ void common_ngram_cache_draft( } } -void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) { +void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) { std::ofstream file_out(filename, std::ios::binary); - for (std::pair item : ngram_cache) { - const common_ngram ngram = item.first; - common_ngram_cache_part token_counts = item.second; + for (std::pair item : ngram_cache) { + const llama_ngram ngram = item.first; + llama_ngram_cache_part token_counts = item.second; GGML_ASSERT(!token_counts.empty()); const int32_t ntokens = token_counts.size(); GGML_ASSERT(ntokens > 0); - file_out.write(reinterpret_cast(&ngram), sizeof(common_ngram)); + file_out.write(reinterpret_cast(&ngram), sizeof(llama_ngram)); file_out.write(reinterpret_cast(&ntokens), sizeof(int32_t)); for (std::pair item2 : token_counts) { const llama_token token = item2.first; @@ -219,14 +216,14 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil } -common_ngram_cache common_ngram_cache_load(std::string & filename) { +llama_ngram_cache llama_ngram_cache_load(std::string & filename) { std::ifstream hashmap_file(filename, 
std::ios::binary); if (!hashmap_file) { throw std::ifstream::failure("Unable to open file " + filename); } - common_ngram_cache ngram_cache; + llama_ngram_cache ngram_cache; - common_ngram ngram; + llama_ngram ngram; int32_t ntokens; llama_token token; int32_t count; @@ -235,11 +232,11 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) { char * ntokensc = reinterpret_cast(&ntokens); char * tokenc = reinterpret_cast(&token); char * countc = reinterpret_cast(&count); - while(hashmap_file.read(ngramc, sizeof(common_ngram))) { + while(hashmap_file.read(ngramc, sizeof(llama_ngram))) { GGML_ASSERT(!hashmap_file.eof()); GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t))); GGML_ASSERT(ntokens > 0); - common_ngram_cache_part token_counts; + llama_ngram_cache_part token_counts; for (int i = 0; i < ntokens; ++i) { GGML_ASSERT(!hashmap_file.eof()); @@ -257,12 +254,12 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) { return ngram_cache; } -void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) { - for (std::pair ngram_part : ngram_cache_add) { - const common_ngram ngram = ngram_part.first; - common_ngram_cache_part part = ngram_part.second; +void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) { + for (std::pair ngram_part : ngram_cache_add) { + const llama_ngram ngram = ngram_part.first; + llama_ngram_cache_part part = ngram_part.second; - common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram); + llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram); if (part_merged_it == ngram_cache_target.end()) { ngram_cache_target.emplace(ngram, part); continue; @@ -273,7 +270,7 @@ void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ng const int32_t count = token_count.second; GGML_ASSERT(count > 0); - common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token); + llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token); if (token_count_merged_it == part_merged_it->second.end()) { part_merged_it->second.emplace(token, count); continue; diff --git a/common/ngram-cache.h b/common/ngram-cache.h index dfe012abe..ab4c9b376 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -12,22 +12,22 @@ // Data structures to map n-grams to empirical token probabilities: -struct common_ngram { +struct llama_ngram { llama_token tokens[LLAMA_NGRAM_MAX]; - common_ngram() { + llama_ngram() { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - tokens[i] = LLAMA_TOKEN_NULL; + tokens[i] = -1; } } - common_ngram(const llama_token * input, const int ngram_size) { + llama_ngram(const llama_token * input, const int ngram_size) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL; + tokens[i] = i < ngram_size ? 
input[i] : -1; } } - bool operator==(const common_ngram & other) const { + bool operator==(const llama_ngram & other) const { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { if (tokens[i] != other.tokens[i]) { return false; @@ -37,28 +37,28 @@ struct common_ngram { } }; -struct common_token_hash_function { +struct llama_token_hash_function { size_t operator()(const llama_token token) const { // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ return token * 11400714819323198485llu; } }; -struct common_ngram_hash_function { - size_t operator()(const common_ngram & ngram) const { - size_t hash = common_token_hash_function{}(ngram.tokens[0]); +struct llama_ngram_hash_function { + size_t operator()(const llama_ngram & ngram) const { + size_t hash = llama_token_hash_function{}(ngram.tokens[0]); for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { - hash ^= common_token_hash_function{}(ngram.tokens[i]); + hash ^= llama_token_hash_function{}(ngram.tokens[i]); } return hash; } }; // token -> number of times token has been seen -typedef std::unordered_map common_ngram_cache_part; +typedef std::unordered_map llama_ngram_cache_part; // n-gram -> empirical distribution of following tokens -typedef std::unordered_map common_ngram_cache; +typedef std::unordered_map llama_ngram_cache; // Update an ngram cache with tokens. @@ -70,8 +70,8 @@ typedef std::unordered_map & inp_data, int nnew, bool print_progress); +void llama_ngram_cache_update( + llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp_data, int nnew, bool print_progress); // Try to draft tokens from ngram caches. // inp: the tokens generated so far. @@ -81,21 +81,21 @@ void common_ngram_cache_update( // nc_context: ngram cache based on current context. // nc_dynamic: ngram cache based on previous user generations. // nc_static: ngram cache generated from a large text corpus, used for validation. -void common_ngram_cache_draft( +void llama_ngram_cache_draft( std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, - common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static); + llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static); // Save an ngram cache to a file. // ngram_cache: the ngram cache to save. // filename: the path under which to save the ngram cache. -void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename); +void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename); -// Load an ngram cache saved with common_ngram_cache_save. +// Load an ngram cache saved with llama_ngram_cache_save. // filename: the path from which to load the ngram cache. // returns: an ngram cache containing the information saved to filename. -common_ngram_cache common_ngram_cache_load(std::string & filename); +llama_ngram_cache llama_ngram_cache_load(std::string & filename); // Merge two ngram caches. // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add. // ngram_cache_add: the ngram cache to add to ngram_cache_target. 
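// Illustrative sketch (editorial, not part of this header or of this patch): a typical call order
// for the cache API declared here, using the llama_ prefix this change restores. The caller owns
// all three caches; the helper name and the 1/4/8 bounds are arbitrary example choices.
static inline std::vector<llama_token> example_ngram_draft(
        std::vector<llama_token> & inp, llama_token last_token,
        llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static) {
    // refresh the context cache with all tokens seen so far
    llama_ngram_cache_update(nc_context, /*ngram_min=*/1, /*ngram_max=*/4, inp, (int) inp.size(), /*print_progress=*/false);

    // draft[0] must be the last confirmed token; llama_ngram_cache_draft appends up to n_draft candidates
    std::vector<llama_token> draft = { last_token };
    llama_ngram_cache_draft(inp, draft, /*n_draft=*/8, /*ngram_min=*/1, /*ngram_max=*/4,
                            nc_context, nc_dynamic, nc_static);
    return draft;
}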
-void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add); +void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add); diff --git a/common/sampling.cpp b/common/sampling.cpp index e4b21ca10..079e40516 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,526 +1,460 @@ +#define LLAMA_API_INTERNAL #include "sampling.h" +#include -#include "common.h" +struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { + struct llama_sampling_context * result = new llama_sampling_context(); -#include -#include + result->params = params; + result->grammar = nullptr; -// the ring buffer works similarly to std::deque, but with a fixed capacity -// TODO: deduplicate with llama-impl.h -template -struct ring_buffer { - ring_buffer(size_t cap) : capacity(cap), data(cap) {} + // if there is a grammar, parse it + if (!params.grammar.empty()) { + result->parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - T & front() { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[first]; - } - - const T & front() const { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[first]; - } - - T & back() { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[pos]; - } - - const T & back() const { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - return data[pos]; - } - - void push_back(const T & value) { - if (sz == capacity) { - // advance the start when buffer is full - first = (first + 1) % capacity; - } else { - sz++; - } - data[pos] = value; - pos = (pos + 1) % capacity; - } - - T pop_front() { - if (sz == 0) { - throw std::runtime_error("ring buffer is empty"); - } - T value = data[first]; - first = (first + 1) % capacity; - sz--; - return value; - } - - const T & rat(size_t i) const { - if (i >= sz) { - throw std::runtime_error("ring buffer: index out of bounds"); - } - return data[(first + sz - i - 1) % capacity]; - } - - std::vector to_vector() const { - std::vector result; - result.reserve(sz); - for (size_t i = 0; i < sz; i++) { - result.push_back(data[(first + i) % capacity]); - } - return result; - } - - void clear() { - // here only reset the status of the buffer - sz = 0; - first = 0; - pos = 0; - } - - bool empty() const { - return sz == 0; - } - - size_t size() const { - return sz; - } - - size_t capacity = 0; - size_t sz = 0; - size_t first = 0; - size_t pos = 0; - std::vector data; -}; - -struct common_sampler { - common_params_sampling params; - - struct llama_sampler * grmr; - struct llama_sampler * chain; - - ring_buffer prev; - - std::vector cur; - - llama_token_data_array cur_p; - - void set_logits(struct llama_context * ctx, int idx) { - const auto * logits = llama_get_logits_ith(ctx, idx); - - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - const int n_vocab = llama_vocab_n_tokens(vocab); - - cur.resize(n_vocab); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + // will be empty (default) if there are parse errors + if (result->parsed_grammar.rules.empty()) { + fprintf(stderr, "%s: failed to parse grammar\n", __func__); + delete result; + return nullptr; } - cur_p = { cur.data(), cur.size(), -1, false }; - } -}; + // Ensure that there is a "root" node. 
+ if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) { + fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__); + delete result; + return nullptr; + } -std::string common_params_sampling::print() const { + std::vector grammar_rules(result->parsed_grammar.c_rules()); + + struct llama_grammar * grammar = llama_grammar_init( + grammar_rules.data(), + grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root")); + if (grammar == nullptr) { + throw std::runtime_error("Failed to initialize llama_grammar"); + } + result->grammar = grammar; + } + + result->prev.resize(params.n_prev); + + result->n_valid = 0; + + llama_sampling_set_rng_seed(result, params.seed); + + return result; +} + +void llama_sampling_free(struct llama_sampling_context * ctx) { + if (ctx->grammar != NULL) { + llama_grammar_free(ctx->grammar); + } + + delete ctx; +} + +void llama_sampling_reset(llama_sampling_context * ctx) { + if (ctx->grammar != NULL) { + llama_grammar_free(ctx->grammar); + ctx->grammar = NULL; + } + + if (!ctx->parsed_grammar.rules.empty()) { + std::vector grammar_rules(ctx->parsed_grammar.c_rules()); + + struct llama_grammar * grammar = llama_grammar_init( + grammar_rules.data(), + grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root")); + if (grammar == nullptr) { + throw std::runtime_error("Failed to initialize llama_grammar"); + } + ctx->grammar = grammar; + } + + std::fill(ctx->prev.begin(), ctx->prev.end(), 0); + ctx->cur.clear(); + ctx->n_valid = 0; +} + +void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { + seed = std::random_device{}(); + } + ctx->rng.seed(seed); +} + +void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) { + if (dst->grammar) { + llama_grammar_free(dst->grammar); + dst->grammar = nullptr; + } + + if (src->grammar) { + dst->grammar = llama_grammar_copy(src->grammar); + } + + dst->prev = src->prev; +} + +llama_token llama_sampling_last(llama_sampling_context * ctx) { + return ctx->prev.back(); +} + +std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) { + const int size = ctx_sampling->prev.size(); + + n = std::min(n, size); + + std::string result; + + for (int i = size - n; i < size; i++) { + result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]); + } + + return result; +} + +std::string llama_sampling_print(const llama_sampling_params & params) { char result[1024]; snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n" - "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", - penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, - top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp, - mirostat, mirostat_eta, mirostat_tau); + params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, + params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp, + params.mirostat, params.mirostat_eta, params.mirostat_tau); 
return std::string(result); } -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) { - const llama_vocab * vocab = llama_model_get_vocab(model); - - llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); - - lparams.no_perf = params.no_perf; - - std::vector trigger_words; - trigger_words.reserve(params.grammar_trigger_words.size()); - for (const auto & str : params.grammar_trigger_words) { - trigger_words.push_back(str.word.c_str()); - } - - struct llama_sampler * grmr; - if (params.grammar.compare(0, 11, "%llguidance") == 0) { -#ifdef LLAMA_USE_LLGUIDANCE - grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()); -#else - GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); -#endif // LLAMA_USE_LLGUIDANCE - } else { - grmr = params.grammar_lazy - ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root", - trigger_words.data(), trigger_words.size(), - params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size()) - : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"); - } - - auto * result = new common_sampler { - /* .params = */ params, - /* .grmr = */ grmr, - /* .chain = */ llama_sampler_chain_init(lparams), - /* .prev = */ ring_buffer(std::max(32, params.n_prev)), - /* .cur = */ {}, - /* .cur_p = */ {}, - }; - - llama_sampler_chain_add(result->chain, - llama_sampler_init_logit_bias( - llama_vocab_n_tokens(vocab), - params.logit_bias.size(), - params.logit_bias.data())); - +std::string llama_sampling_order_print(const llama_sampling_params & params) { + std::string result = "CFG -> Penalties "; if (params.mirostat == 0) { - for (const auto & cnstr : params.samplers) { - switch (cnstr) { - case COMMON_SAMPLER_TYPE_DRY: - { - std::vector c_breakers; - c_breakers.reserve(params.dry_sequence_breakers.size()); - for (const auto & str : params.dry_sequence_breakers) { - c_breakers.push_back(str.c_str()); - } - - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); - } - break; - case COMMON_SAMPLER_TYPE_TOP_K: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); - break; - case COMMON_SAMPLER_TYPE_TOP_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_MIN_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); - break; - case COMMON_SAMPLER_TYPE_TYPICAL_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_TEMPERATURE: - llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); - break; - case COMMON_SAMPLER_TYPE_INFILL: - llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); - break; - case COMMON_SAMPLER_TYPE_PENALTIES: - llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); - break; - default: - GGML_ASSERT(false && 
"unknown sampler type"); + for (auto sampler_type : params.samplers_sequence) { + const auto sampler_type_name = llama_sampling_type_to_str(sampler_type); + if (!sampler_type_name.empty()) { + result += "-> " + sampler_type_name + " "; } } - llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); - } else if (params.mirostat == 1) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); - } else if (params.mirostat == 2) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); } else { - GGML_ASSERT(false && "unknown mirostat version"); + result += "-> mirostat "; } return result; } -void common_sampler_free(struct common_sampler * gsmpl) { - if (gsmpl) { - llama_sampler_free(gsmpl->grmr); - - llama_sampler_free(gsmpl->chain); - - delete gsmpl; - } -} - -void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { - if (accept_grammar) { - llama_sampler_accept(gsmpl->grmr, token); - } - - llama_sampler_accept(gsmpl->chain, token); - - gsmpl->prev.push_back(token); -} - -void common_sampler_reset(struct common_sampler * gsmpl) { - llama_sampler_reset(gsmpl->grmr); - - llama_sampler_reset(gsmpl->chain); -} - -struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { - return new common_sampler { - /* .params = */ gsmpl->params, - /* .grmr = */ llama_sampler_clone(gsmpl->grmr), - /* .chain = */ llama_sampler_clone(gsmpl->chain), - /* .prev = */ gsmpl->prev, - /* .cur = */ gsmpl->cur, - /* .cur_p = */ gsmpl->cur_p, - }; -} - -void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) { - // TODO: measure grammar performance - - if (gsmpl) { - llama_perf_sampler_print(gsmpl->chain); - } - if (ctx) { - llama_perf_context_print(ctx); - } -} - -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { - gsmpl->set_logits(ctx, idx); - - auto & grmr = gsmpl->grmr; - auto & chain = gsmpl->chain; - auto & cur_p = gsmpl->cur_p; // initialized by set_logits - - if (grammar_first) { - llama_sampler_apply(grmr, &cur_p); - } - - llama_sampler_apply(chain, &cur_p); - - GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); - - const llama_token id = cur_p.data[cur_p.selected].id; - - if (grammar_first) { - return id; - } - - // check if it the sampled token fits the grammar - { - llama_token_data single_token_data = { id, 1.0f, 0.0f }; - llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; - - llama_sampler_apply(grmr, &single_token_data_array); - - const bool is_valid = single_token_data_array.data[0].logit != -INFINITY; - if (is_valid) { - return id; - } - } - - // resampling: - // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain - gsmpl->set_logits(ctx, idx); - - llama_sampler_apply(grmr, &cur_p); - llama_sampler_apply(chain, &cur_p); - - GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration"); - - return cur_p.data[cur_p.selected].id; -} - -std::vector common_sampler_sample_and_accept_n(struct 
common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first) { - GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1"); - - std::vector result; - result.reserve(idxs.size()); - - size_t i = 0; - for (; i < draft.size(); i++) { - const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); - - common_sampler_accept(gsmpl, id, true); - - result.push_back(id); - - if (draft[i] != id) { - break; - } - } - - if (i == draft.size()) { - const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); - - common_sampler_accept(gsmpl, id, true); - - result.push_back(id); - } - - return result; -} - -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) { - std::vector idxs(draft.size() + 1); - for (size_t i = 0; i < idxs.size(); ++i) { - idxs[i] = i; - } - - return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first); -} - -uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { - return llama_sampler_get_seed(gsmpl->chain); -} - -// helpers - -llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) { - return &gsmpl->cur_p; -} - -llama_token common_sampler_last(const struct common_sampler * gsmpl) { - return gsmpl->prev.rat(0); -} - -std::string common_sampler_print(const struct common_sampler * gsmpl) { - std::string result = "logits "; - - for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { - const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); - result += std::string("-> ") + llama_sampler_name(smpl) + " "; - } - - return result; -} - -std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) { - n = std::min(n, (int) gsmpl->prev.size()); - - if (n <= 0) { - return ""; - } - - std::string result; - result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab - - for (int i = n - 1; i >= 0; i--) { - const llama_token id = gsmpl->prev.rat(i); - - GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen"); - - result += common_token_to_piece(ctx_main, id); - } - - return result; -} - -char common_sampler_type_to_chr(enum common_sampler_type cnstr) { - switch (cnstr) { - case COMMON_SAMPLER_TYPE_DRY: return 'd'; - case COMMON_SAMPLER_TYPE_TOP_K: return 'k'; - case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y'; - case COMMON_SAMPLER_TYPE_TOP_P: return 'p'; - case COMMON_SAMPLER_TYPE_MIN_P: return 'm'; - case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; - case COMMON_SAMPLER_TYPE_XTC: return 'x'; - case COMMON_SAMPLER_TYPE_INFILL: return 'i'; - case COMMON_SAMPLER_TYPE_PENALTIES: return 'e'; - default : return '?'; - } -} - -std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { - switch (cnstr) { - case COMMON_SAMPLER_TYPE_DRY: return "dry"; - case COMMON_SAMPLER_TYPE_TOP_K: return "top_k"; - case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; - case COMMON_SAMPLER_TYPE_TOP_P: return "top_p"; - case COMMON_SAMPLER_TYPE_MIN_P: return "min_p"; - case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; - case COMMON_SAMPLER_TYPE_XTC: return "xtc"; - case COMMON_SAMPLER_TYPE_INFILL: return "infill"; - case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties"; +std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) { + switch 
(sampler_type) { + case llama_sampler_type::TOP_K: return "top_k"; + case llama_sampler_type::TFS_Z: return "tfs_z"; + case llama_sampler_type::TYPICAL_P: return "typical_p"; + case llama_sampler_type::TOP_P: return "top_p"; + case llama_sampler_type::MIN_P: return "min_p"; + case llama_sampler_type::TEMPERATURE: return "temperature"; default : return ""; } } -std::vector common_sampler_types_from_names(const std::vector & names, bool allow_alt_names) { - std::unordered_map sampler_canonical_name_map { - { "dry", COMMON_SAMPLER_TYPE_DRY }, - { "top_k", COMMON_SAMPLER_TYPE_TOP_K }, - { "top_p", COMMON_SAMPLER_TYPE_TOP_P }, - { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P }, - { "min_p", COMMON_SAMPLER_TYPE_MIN_P }, - { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, - { "xtc", COMMON_SAMPLER_TYPE_XTC }, - { "infill", COMMON_SAMPLER_TYPE_INFILL }, - { "penalties", COMMON_SAMPLER_TYPE_PENALTIES }, +std::vector llama_sampling_types_from_names(const std::vector & names, bool allow_alt_names) { + std::unordered_map sampler_canonical_name_map { + {"top_k", llama_sampler_type::TOP_K}, + {"top_p", llama_sampler_type::TOP_P}, + {"typical_p", llama_sampler_type::TYPICAL_P}, + {"min_p", llama_sampler_type::MIN_P}, + {"tfs_z", llama_sampler_type::TFS_Z}, + {"temperature", llama_sampler_type::TEMPERATURE} }; // since samplers names are written multiple ways // make it ready for both system names and input names - std::unordered_map sampler_alt_name_map { - { "top-k", COMMON_SAMPLER_TYPE_TOP_K }, - { "top-p", COMMON_SAMPLER_TYPE_TOP_P }, - { "nucleus", COMMON_SAMPLER_TYPE_TOP_P }, - { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, - { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P }, - { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, - { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P }, - { "min-p", COMMON_SAMPLER_TYPE_MIN_P }, - { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE }, + std::unordered_map sampler_alt_name_map { + {"top-k", llama_sampler_type::TOP_K}, + {"top-p", llama_sampler_type::TOP_P}, + {"nucleus", llama_sampler_type::TOP_P}, + {"typical-p", llama_sampler_type::TYPICAL_P}, + {"typical", llama_sampler_type::TYPICAL_P}, + {"min-p", llama_sampler_type::MIN_P}, + {"tfs-z", llama_sampler_type::TFS_Z}, + {"tfs", llama_sampler_type::TFS_Z}, + {"temp", llama_sampler_type::TEMPERATURE} }; - std::vector samplers; - samplers.reserve(names.size()); + std::vector sampler_types; + sampler_types.reserve(names.size()); + for (const auto & name : names) + { + auto sampler_item = sampler_canonical_name_map.find(name); + if (sampler_item != sampler_canonical_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + else + { + if (allow_alt_names) + { + sampler_item = sampler_alt_name_map.find(name); + if (sampler_item != sampler_alt_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + } + } + } + return sampler_types; +} - for (const auto & name : names) { - auto sampler = sampler_canonical_name_map.find(name); - if (sampler != sampler_canonical_name_map.end()) { - samplers.push_back(sampler->second); +std::vector llama_sampling_types_from_chars(const std::string & names_string) { + std::unordered_map sampler_name_map { + {'k', llama_sampler_type::TOP_K}, + {'p', llama_sampler_type::TOP_P}, + {'y', llama_sampler_type::TYPICAL_P}, + {'m', llama_sampler_type::MIN_P}, + {'f', llama_sampler_type::TFS_Z}, + {'t', llama_sampler_type::TEMPERATURE} + }; + + std::vector sampler_types; + sampler_types.reserve(names_string.size()); + for (const auto & c : names_string) { + const auto sampler_item = 
sampler_name_map.find(c); + if (sampler_item != sampler_name_map.end()) { + sampler_types.push_back(sampler_item->second); + } + } + return sampler_types; +} + +// no reasons to expose this function in header +static void sampler_queue( + struct llama_context * ctx_main, + const llama_sampling_params & params, + llama_token_data_array & cur_p, + size_t min_keep) { + const float temp = params.temp; + const float dynatemp_range = params.dynatemp_range; + const float dynatemp_exponent = params.dynatemp_exponent; + const int32_t top_k = params.top_k; + const float top_p = params.top_p; + const float min_p = params.min_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const std::vector & samplers_sequence = params.samplers_sequence; + + for (auto sampler_type : samplers_sequence) { + switch (sampler_type) { + case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; + case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; + case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; + case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; + case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case llama_sampler_type::TEMPERATURE: + if (dynatemp_range > 0) { + float dynatemp_min = std::max(0.0f, temp - dynatemp_range); + float dynatemp_max = std::max(0.0f, temp + dynatemp_range); + llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent); + } else { + llama_sample_temp(ctx_main, &cur_p, temp); + } + break; + default : break; + } + } +} + +static llama_token llama_sampling_sample_impl( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx, + bool is_resampling) { + const llama_sampling_params & params = ctx_sampling->params; + + const float temp = params.temp; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + + std::vector original_logits; + auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits); + if (ctx_sampling->grammar != NULL && !is_resampling) { + GGML_ASSERT(!original_logits.empty()); + } + llama_token id = 0; + + if (temp < 0.0) { + // greedy sampling, with probs + llama_sample_softmax(ctx_main, &cur_p); + id = cur_p.data[0].id; + } else if (temp == 0.0) { + // greedy sampling, no probs + id = llama_sample_token_greedy(ctx_main, &cur_p); + } else { + if (mirostat == 1) { + const int mirostat_m = 100; + llama_sample_temp(ctx_main, &cur_p, temp); + id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu); + } else if (mirostat == 2) { + llama_sample_temp(ctx_main, &cur_p, temp); + id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu); } else { - if (allow_alt_names) { - sampler = sampler_alt_name_map.find(name); - if (sampler != sampler_alt_name_map.end()) { - samplers.push_back(sampler->second); + // temperature sampling + size_t min_keep = std::max(1, params.min_keep); + + sampler_queue(ctx_main, params, cur_p, min_keep); + + id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng); + + //{ + // const int n_top = 10; + // LOG("top %d 
candidates:\n", n_top); + + // for (int i = 0; i < n_top; i++) { + // const llama_token id = cur_p.data[i].id; + // (void)id; // To avoid a warning that id is unused when logging is disabled. + // LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p); + // } + //} + + //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str()); + } + } + + if (ctx_sampling->grammar != NULL && !is_resampling) { + // Get a pointer to the logits + float * logits = llama_get_logits_ith(ctx_main, idx); + + // Create an array with a single token data element for the sampled id + llama_token_data single_token_data = {id, logits[id], 0.0f}; + llama_token_data_array single_token_data_array = { &single_token_data, 1, false }; + + // Apply grammar constraints to the single token + llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array); + + // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY + bool is_valid = single_token_data_array.data[0].logit != -INFINITY; + + // If the token is not valid according to the grammar, perform resampling + if (!is_valid) { + LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str()); + + // Restore logits from the copy + std::copy(original_logits.begin(), original_logits.end(), logits); + + return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true); + } + } + + ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size; + + return id; +} + +static llama_token_data_array llama_sampling_prepare_impl( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx, + bool apply_grammar, + std::vector * original_logits) { + const llama_sampling_params & params = ctx_sampling->params; + + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); + + const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; + const float penalty_repeat = params.penalty_repeat; + const float penalty_freq = params.penalty_freq; + const float penalty_present = params.penalty_present; + + const bool penalize_nl = params.penalize_nl; + + auto & prev = ctx_sampling->prev; + auto & cur = ctx_sampling->cur; + + // Get a pointer to the logits + float * logits = llama_get_logits_ith(ctx_main, idx); + + if (ctx_sampling->grammar != NULL && !apply_grammar) { + GGML_ASSERT(original_logits != NULL); + // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. + *original_logits = {logits, logits + n_vocab}; + } + + // apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + if (ctx_cfg) { + float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx); + llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); + } + + cur.resize(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), false }; + + // apply penalties + const auto& penalty_tokens = params.use_penalty_prompt_tokens ? 
params.penalty_prompt_tokens : prev; + const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); + if (penalty_tokens_used_size) { + const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; + + llama_sample_repetition_penalties(ctx_main, &cur_p, + penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size, + penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present); + + if (!penalize_nl) { + for (size_t idx = 0; idx < cur_p.size; idx++) { + if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { + cur_p.data[idx].logit = nl_logit; + break; } } } } - return samplers; -} - -std::vector common_sampler_types_from_chars(const std::string & chars) { - std::unordered_map sampler_name_map = { - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES }, - }; - - std::vector samplers; - samplers.reserve(chars.size()); - - for (const auto & c : chars) { - const auto sampler = sampler_name_map.find(c); - if (sampler != sampler_name_map.end()) { - samplers.push_back(sampler->second); - } + // apply grammar checks before sampling logic + if (apply_grammar && ctx_sampling->grammar != NULL) { + llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p); } - return samplers; + return cur_p; +} + +llama_token llama_sampling_sample( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx) { + // Call the implementation function with is_resampling set to false by default + return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false); +} + +llama_token_data_array llama_sampling_prepare( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx, + bool apply_grammar, + std::vector * original_logits) { + return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits); +} + +void llama_sampling_accept( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + llama_token id, + bool apply_grammar) { + ctx_sampling->prev.erase(ctx_sampling->prev.begin()); + ctx_sampling->prev.push_back(id); + + if (ctx_sampling->grammar != NULL && apply_grammar) { + llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id); + } } diff --git a/common/sampling.h b/common/sampling.h index 2064421db..eeaa53b8b 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -2,106 +2,159 @@ #include "llama.h" -#include "common.h" +#include "grammar-parser.h" +#include #include +#include #include -// common_sampler extends llama_sampler with additional functionality: +// sampler types +enum class llama_sampler_type : char { + TOP_K = 'k', + TOP_P = 
'p', + MIN_P = 'm', + TFS_Z = 'f', + TYPICAL_P = 'y', + TEMPERATURE = 't' +}; + +// sampling parameters +typedef struct llama_sampling_params { + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = false; // consider newlines as a repeatable token + uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context + + std::vector samplers_sequence = { + llama_sampler_type::TOP_K, + llama_sampler_type::TFS_Z, + llama_sampler_type::TYPICAL_P, + llama_sampler_type::TOP_P, + llama_sampler_type::MIN_P, + llama_sampler_type::TEMPERATURE + }; + + std::string grammar; // optional BNF-like grammar to constrain sampling + + // Classifier-Free Guidance + // https://arxiv.org/abs/2306.17806 + std::string cfg_negative_prompt; // string to help guidance + float cfg_scale = 1.f; // how strong is guidance + + std::unordered_map logit_bias; // logit bias for specific tokens + + std::vector penalty_prompt_tokens; + bool use_penalty_prompt_tokens = false; +} llama_sampling_params; + +// general sampler context +// TODO: move to llama.h +struct llama_sampling_context { + // parameters that will be used for sampling + llama_sampling_params params; + + // mirostat sampler state + float mirostat_mu; + + llama_grammar * grammar; + + // internal + grammar_parser::parse_state parsed_grammar; + + // TODO: replace with ring-buffer + std::vector prev; + std::vector cur; + size_t n_valid; // Number of correct top tokens with correct probabilities. + + std::mt19937 rng; +}; + +#include "common.h" + +// Create a new sampling context instance. 
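// Illustrative sketch only, not part of the patch: configuring the llama_sampling_params
// struct declared above and creating a sampling context with it. Field names follow the
// struct definition; the concrete values, the sampler order and the grammar are placeholders.
llama_sampling_params sparams;
sparams.temp           = 0.7f;
sparams.top_k          = 40;
sparams.top_p          = 0.9f;
sparams.penalty_last_n = 64;
sparams.penalty_repeat = 1.1f;
sparams.samplers_sequence = {
    llama_sampler_type::TOP_K,
    llama_sampler_type::TOP_P,
    llama_sampler_type::TEMPERATURE,
};
// sparams.grammar = ...;  // optional BNF-like grammar string, parsed by llama_sampling_init

llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
if (ctx_sampling == nullptr) {
    // llama_sampling_init returns nullptr when the grammar fails to parse
    // or does not contain a 'root' symbol (see sampling.cpp above)
}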
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params); + +void llama_sampling_free(struct llama_sampling_context * ctx); + +// Reset the sampler context +// - clear prev tokens +// - reset grammar +void llama_sampling_reset(llama_sampling_context * ctx); + +// Set the sampler seed +void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed); + +// Copy the sampler context +void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst); + +// Get the last sampled token +llama_token llama_sampling_last(llama_sampling_context * ctx); + +// Get a string representation of the last sampled tokens +std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n); + +// Print sampling parameters into a string +std::string llama_sampling_print(const llama_sampling_params & params); + +// Print sampling order into a string +std::string llama_sampling_order_print(const llama_sampling_params & params); + +std::string llama_sampling_type_to_str(llama_sampler_type sampler_type); + +std::vector llama_sampling_types_from_names(const std::vector & names, bool allow_alt_names); +std::vector llama_sampling_types_from_chars(const std::string & names_string); + +// this is a common sampling function used across the examples for convenience +// it can serve as a starting point for implementing your own sampling function +// Note: When using multiple sequences, it is the caller's responsibility to call +// llama_sampling_reset when a sequence ends // -// - grammar support -// - custom sampler logic based on the parameters -// - history of the last accepted tokens -// - performance metrics +// required: +// - ctx_main: context to use for sampling +// - ctx_sampling: sampling-specific context // -// This goal is to have a common implementation of the sampling logic shared across the examples. -// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more -// complex (top-k, top-p, etc). +// optional: +// - ctx_cfg: context to use for classifier-free guidance +// - idx: sample from llama_get_logits_ith(ctx, idx) // -// Another example is related to the grammar. In general, the grammar constraints applied on the full -// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled -// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the -// grammar constraints are applied to the full vocabulary and the token is resampled. -// -// The common_sampler also maintains a container with the last accepted tokens. In the future, this can -// be moved into the core llama library. -// -// For convenience, the common_sampler also maintains a container with the current candidate tokens. -// This can be used to access the probabilities of the rest of the non-sampled tokens. -// -// TODO: measure grammar performance +// returns: +// - token: sampled token +// - candidates: vector of candidate tokens // +llama_token llama_sampling_sample( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + int idx = -1); -struct common_sampler; +// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters. 
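// Illustrative sketch only, not part of the patch: the generation loop that the comment
// above describes. ctx_main and ctx_sampling are assumed to be already initialized;
// n_predict stands in for the caller's own stopping logic, and batch evaluation is elided.
void generate_example(llama_context * ctx_main, llama_sampling_context * ctx_sampling, int n_predict) {
    for (int i = 0; i < n_predict; ++i) {
        // sample from the logits of the last evaluated token (idx defaults to -1)
        const llama_token id = llama_sampling_sample(ctx_sampling, ctx_main, /*ctx_cfg=*/nullptr);

        // push the token into the sampling history and advance the grammar state
        llama_sampling_accept(ctx_sampling, ctx_main, id, /*apply_grammar=*/true);

        // ... stop on an end-of-generation token, append id to a batch,
        //     call llama_decode(ctx_main, batch), print the piece ...
    }

    // the caller is responsible for resetting the sampler when a sequence ends (see note above)
    llama_sampling_reset(ctx_sampling);
}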
+llama_token_data_array llama_sampling_prepare( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + int idx = 0, + bool apply_grammar = true, + std::vector * original_logits = nullptr); -// llama_sampler API overloads - -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params); - -void common_sampler_free(struct common_sampler * gsmpl); - -// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar -void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar); -void common_sampler_reset (struct common_sampler * gsmpl); -struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); - -// arguments can be nullptr to skip printing -void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl); - -// extended sampling implementation: -// -// - set logits -// - apply the configured sampler chain -// - check if the token fits the grammar (if any) -// - if not: resample by first applying the grammar constraints and then sampling again (slower path) -// -// if grammar_first is true, the grammar is applied before the samplers (slower) -// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar -// -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); - -// generalized version of common_sampler_sample -// -// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match -// if the sampler disagrees at some point, we stop and return the accepted tokens up to now -// -// common_sampler_sample_n(gsmpl, ctx, { idx }, {}); -// -// is equivalent to -// -// common_sampler_sample(gsmpl, ctx, idx); -// common_sampler_accept(gsmpl, token, true); -// -// requires: idxs.size() == draft.size() + 1 -// -// returns at least 1 token, up to idxs.size() -// -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first = false); - -// assume idxs == [ 0, 1, 2, ..., draft.size() ] -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false); - -uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); - -// helpers - -// access the internal list of current candidate tokens -llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl); - -// get the last accepted token -llama_token common_sampler_last(const struct common_sampler * gsmpl); - -// print the sampler chain into a string -std::string common_sampler_print(const struct common_sampler * gsmpl); - -// get a string representation of the last accepted tokens -std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n); - -char common_sampler_type_to_chr(enum common_sampler_type cnstr); -std::string common_sampler_type_to_str(enum common_sampler_type cnstr); - -std::vector common_sampler_types_from_names(const std::vector & names, bool allow_alt_names); -std::vector common_sampler_types_from_chars(const std::string & chars); - -llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, - const char * grammar_kind, const char * grammar_data); +void llama_sampling_accept( + 
struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + llama_token id, + bool apply_grammar); diff --git a/common/speculative.cpp b/common/speculative.cpp deleted file mode 100644 index 318e96ea3..000000000 --- a/common/speculative.cpp +++ /dev/null @@ -1,277 +0,0 @@ -#include "speculative.h" - -#include "log.h" -#include "common.h" -#include "sampling.h" - -#include - -#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 -#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 - -struct common_speculative { - struct llama_context * ctx; - struct common_sampler * smpl; - - llama_batch batch; - llama_tokens prompt; -}; - -struct common_speculative * common_speculative_init( - struct llama_context * ctx_dft) { - auto * result = new common_speculative { - /* .ctx = */ ctx_dft, - /* .smpl = */ nullptr, - /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1), - /* .prompt = */ {}, - }; - - // TODO: optimize or pass from outside? -#if 0 - { - common_params_sampling params; - params.no_perf = false; - - params.top_k = 40; - params.top_p = 0.9; - - params.samplers = { - COMMON_SAMPLER_TYPE_TOP_K, - COMMON_SAMPLER_TYPE_TOP_P, - COMMON_SAMPLER_TYPE_INFILL, - }; - - result->smpl = common_sampler_init(llama_get_model(ctx_dft), params); - } -#else - { - common_params_sampling params; - params.no_perf = false; - - params.top_k = 10; - - params.samplers = { - COMMON_SAMPLER_TYPE_TOP_K, - }; - - result->smpl = common_sampler_init(llama_get_model(ctx_dft), params); - } -#endif - - return result; -} - -void common_speculative_free(struct common_speculative * spec) { - if (spec == nullptr) { - return; - } - - common_sampler_free(spec->smpl); - - llama_batch_free(spec->batch); - - delete spec; -} - -bool common_speculative_are_compatible( - const struct llama_context * ctx_tgt, - const struct llama_context * ctx_dft) { - const struct llama_model * model_tgt = llama_get_model(ctx_tgt); - const struct llama_model * model_dft = llama_get_model(ctx_dft); - - const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); - const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); - - const bool vocab_type_tgt = llama_vocab_type(vocab_tgt); - LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); - - const bool vocab_type_dft = llama_vocab_type(vocab_dft); - LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); - - if (vocab_type_tgt != vocab_type_dft) { - LOG_ERR("%s: draft model vocab type must match target model to use speculation but " - "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt); - return false; - } - - if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) || - llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) || - llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) || - llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) { - LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__); - LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt)); - LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft)); - return false; - } - - { - const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt); - const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft); - - const 
int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); - - if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { - LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " - "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", - __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); - return false; - } - - for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { - const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i); - const char * token_text_dft = llama_vocab_get_text(vocab_dft, i); - if (std::strcmp(token_text_tgt, token_text_dft) != 0) { - LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but " - "token %d content differs - target '%s', draft '%s'\n", __func__, i, - common_token_to_piece(ctx_tgt, i).c_str(), - common_token_to_piece(ctx_dft, i).c_str()); - return false; - } - } - } - - return true; -} - -llama_tokens common_speculative_gen_draft( - struct common_speculative * spec, - struct common_speculative_params params, - const llama_tokens & prompt_tgt, - llama_token id_last) { - auto & batch = spec->batch; - auto & ctx = spec->ctx; - auto & smpl = spec->smpl; - auto & prompt = spec->prompt; - - int reuse_i = 0; - int reuse_n = 0; - - const int n_ctx = llama_n_ctx(ctx) - params.n_draft; - - const int i_start = std::max(0, (int) prompt_tgt.size() - n_ctx); - - // reuse as much as possible from the old draft context - // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt - for (int i = 0; i < (int) prompt.size(); ++i) { - int cur = 0; - while (i_start + cur < (int) prompt_tgt.size() && - i + cur < (int) prompt.size() && - prompt_tgt[i_start + cur] == prompt[i + cur]) { - cur++; - } - - if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) { - reuse_i = i; - reuse_n = cur; - } - } - - LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size()); - - llama_tokens result; - result.reserve(params.n_draft); - - if (reuse_n == 0) { - llama_kv_cache_clear(ctx); - - prompt.clear(); - } else { - // this happens when a previous draft has been discarded (for example, due to being too small), but the - // target model agreed with it. 
in this case, we simply pass back the previous results to save compute - if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) { - for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) { - result.push_back(prompt[i]); - - if (params.n_draft <= (int) result.size()) { - break; - } - } - - return result; - } - - if (reuse_i > 0) { - llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i); - llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i); - - prompt.erase(prompt.begin(), prompt.begin() + reuse_i); - } - - if (reuse_n < (int) prompt.size()) { - llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1); - - prompt.erase(prompt.begin() + reuse_n, prompt.end()); - } - } - - // prepare a batch to evaluate any new tokens in the prompt - common_batch_clear(batch); - - for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) { - //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]); - common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false); - - prompt.push_back(prompt_tgt[i]); - } - - // we should rarely end-up here during normal decoding - if (batch.n_tokens > 0) { - //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str()); - - llama_decode(ctx, batch); - } - - const llama_pos n_past = prompt.size(); - - LOG_DBG("%s: n_past = %d\n", __func__, n_past); - - common_batch_clear(batch); - common_batch_add (batch, id_last, n_past, { 0 }, true); - - prompt.push_back(id_last); - - //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str()); - - llama_decode(ctx, batch); - - common_sampler_reset(smpl); - - // sample n_draft tokens from the draft model - for (int i = 0; i < params.n_draft; ++i) { - common_batch_clear(batch); - - common_sampler_sample(smpl, ctx, 0, true); - - const auto * cur_p = common_sampler_get_candidates(smpl); - - for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { - LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str()); - } - - // add drafted token for each sequence - const llama_token id = cur_p->data[0].id; - - // only collect very high-confidence draft tokens - if (cur_p->data[0].p < params.p_min) { - break; - } - - common_sampler_accept(smpl, id, true); - - result.push_back(id); - - if (params.n_draft <= (int) result.size()) { - break; - } - - common_batch_add(batch, id, n_past + i + 1, { 0 }, true); - - // evaluate the drafted tokens on the draft model - llama_decode(ctx, batch); - - prompt.push_back(id); - } - - return result; -} diff --git a/common/speculative.h b/common/speculative.h deleted file mode 100644 index 50ec03446..000000000 --- a/common/speculative.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include "llama.h" -#include "common.h" - -struct common_speculative; - -struct common_speculative_params { - int n_draft = 16; // max drafted tokens - int n_reuse = 256; - - float p_min = 0.9f; // min probabiliy required to accept a token in the draft -}; - -struct common_speculative * common_speculative_init(struct llama_context * ctx_dft); - -void common_speculative_free(struct common_speculative * spec); - -bool common_speculative_are_compatible( - const struct llama_context * ctx_tgt, - const struct llama_context * ctx_dft); - -// sample up to n_draft tokens and add them to the batch using the draft model -llama_tokens common_speculative_gen_draft( - struct common_speculative * spec, - struct 
common_speculative_params params, - const llama_tokens & prompt, - llama_token id_last); diff --git a/common/stb_image.h b/common/stb_image.h index 9eedabedc..4766d7e67 100644 --- a/common/stb_image.h +++ b/common/stb_image.h @@ -1,4 +1,4 @@ -/* stb_image - v2.30 - public domain image loader - http://nothings.org/stb +/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb no warranty implied; use at your own risk Do this: @@ -48,8 +48,6 @@ LICENSE RECENT REVISION HISTORY: - 2.30 (2024-05-31) avoid erroneous gcc warning - 2.29 (2023-05-xx) optimizations 2.28 (2023-01-29) many error fixes, security errors, just tons of stuff 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes 2.26 (2020-07-13) many minor fixes @@ -373,14 +371,13 @@ RECENT REVISION HISTORY: #define STBI_VERSION 1 -enum -{ - STBI_default = 0, // only used for desired_channels +enum { + STBI_default = 0, // only used for desired_channels - STBI_grey = 1, - STBI_grey_alpha = 2, - STBI_rgb = 3, - STBI_rgb_alpha = 4 + STBI_grey = 1, + STBI_grey_alpha = 2, + STBI_rgb = 3, + STBI_rgb_alpha = 4 }; #include @@ -408,11 +405,11 @@ extern "C" { // load image by filename, open file, or memory buffer // -typedef struct -{ - int (*read) (void *user,char *data,int size); // fill 'data' with 'size' bytes. return number of bytes actually read - void (*skip) (void *user,int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative - int (*eof) (void *user); // returns nonzero if we are at end of file/data +typedef struct { + int (*read)(void * user, char * data, + int size); // fill 'data' with 'size' bytes. return number of bytes actually read + void (*skip)(void * user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative + int (*eof)(void * user); // returns nonzero if we are at end of file/data } stbi_io_callbacks; //////////////////////////////////// @@ -420,21 +417,24 @@ typedef struct // 8-bits-per-channel interface // -STBIDEF stbi_uc *stbi_load_from_memory (stbi_uc const *buffer, int len , int *x, int *y, int *channels_in_file, int desired_channels); -STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk , void *user, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_uc * stbi_load_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file, + int desired_channels); +STBIDEF stbi_uc * stbi_load_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, + int * channels_in_file, int desired_channels); #ifndef STBI_NO_STDIO -STBIDEF stbi_uc *stbi_load (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); -STBIDEF stbi_uc *stbi_load_from_file (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_uc * stbi_load(char const * filename, int * x, int * y, int * channels_in_file, int desired_channels); +STBIDEF stbi_uc * stbi_load_from_file(FILE * f, int * x, int * y, int * channels_in_file, int desired_channels); // for stbi_load_from_file, file pointer is left pointing immediately after image #endif #ifndef STBI_NO_GIF -STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp); +STBIDEF stbi_uc * stbi_load_gif_from_memory(stbi_uc const * buffer, int len, int ** delays, int * x, int * y, int * z, + int * comp, int req_comp); #endif #ifdef STBI_WINDOWS_UTF8 -STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const 
wchar_t* input); +STBIDEF int stbi_convert_wchar_to_utf8(char * buffer, size_t bufferlen, const wchar_t * input); #endif //////////////////////////////////// @@ -442,12 +442,14 @@ STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wch // 16-bits-per-channel interface // -STBIDEF stbi_us *stbi_load_16_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels); -STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_us * stbi_load_16_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file, + int desired_channels); +STBIDEF stbi_us * stbi_load_16_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, + int * channels_in_file, int desired_channels); #ifndef STBI_NO_STDIO -STBIDEF stbi_us *stbi_load_16 (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); -STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_us * stbi_load_16(char const * filename, int * x, int * y, int * channels_in_file, int desired_channels); +STBIDEF stbi_us * stbi_load_from_file_16(FILE * f, int * x, int * y, int * channels_in_file, int desired_channels); #endif //////////////////////////////////// @@ -455,56 +457,55 @@ STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_i // float-per-channel interface // #ifndef STBI_NO_LINEAR - STBIDEF float *stbi_loadf_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels); - STBIDEF float *stbi_loadf_from_callbacks (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF float * stbi_loadf_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file, + int desired_channels); +STBIDEF float * stbi_loadf_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * channels_in_file, + int desired_channels); - #ifndef STBI_NO_STDIO - STBIDEF float *stbi_loadf (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); - STBIDEF float *stbi_loadf_from_file (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); - #endif +#ifndef STBI_NO_STDIO +STBIDEF float * stbi_loadf(char const * filename, int * x, int * y, int * channels_in_file, int desired_channels); +STBIDEF float * stbi_loadf_from_file(FILE * f, int * x, int * y, int * channels_in_file, int desired_channels); +#endif #endif #ifndef STBI_NO_HDR - STBIDEF void stbi_hdr_to_ldr_gamma(float gamma); - STBIDEF void stbi_hdr_to_ldr_scale(float scale); +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma); +STBIDEF void stbi_hdr_to_ldr_scale(float scale); #endif // STBI_NO_HDR #ifndef STBI_NO_LINEAR - STBIDEF void stbi_ldr_to_hdr_gamma(float gamma); - STBIDEF void stbi_ldr_to_hdr_scale(float scale); +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma); +STBIDEF void stbi_ldr_to_hdr_scale(float scale); #endif // STBI_NO_LINEAR // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR -STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user); -STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len); +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const * clbk, void * user); +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const * buffer, int 
len); #ifndef STBI_NO_STDIO -STBIDEF int stbi_is_hdr (char const *filename); -STBIDEF int stbi_is_hdr_from_file(FILE *f); +STBIDEF int stbi_is_hdr(char const * filename); +STBIDEF int stbi_is_hdr_from_file(FILE * f); #endif // STBI_NO_STDIO - // get a VERY brief reason for failure // on most compilers (and ALL modern mainstream compilers) this is threadsafe -STBIDEF const char *stbi_failure_reason (void); +STBIDEF const char * stbi_failure_reason(void); // free the loaded image -- this is just free() -STBIDEF void stbi_image_free (void *retval_from_stbi_load); +STBIDEF void stbi_image_free(void * retval_from_stbi_load); // get image dimensions & components without fully decoding -STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp); -STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp); -STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len); -STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user); +STBIDEF int stbi_info_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp); +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * comp); +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const * buffer, int len); +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const * clbk, void * user); #ifndef STBI_NO_STDIO -STBIDEF int stbi_info (char const *filename, int *x, int *y, int *comp); -STBIDEF int stbi_info_from_file (FILE *f, int *x, int *y, int *comp); -STBIDEF int stbi_is_16_bit (char const *filename); -STBIDEF int stbi_is_16_bit_from_file(FILE *f); +STBIDEF int stbi_info(char const * filename, int * x, int * y, int * comp); +STBIDEF int stbi_info_from_file(FILE * f, int * x, int * y, int * comp); +STBIDEF int stbi_is_16_bit(char const * filename); +STBIDEF int stbi_is_16_bit_from_file(FILE * f); #endif - - // for image formats that explicitly notate that they have premultiplied alpha, // we just return the colors as stored in the file. set this flag to force // unpremultiplication. results are undefined if the unpremultiply overflow. 
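// Illustrative sketch only, not part of the patch: the usual 8-bit load path and a
// FILE*-backed use of the stbi_io_callbacks interface declared above. "image.png" is a
// placeholder path; STB_IMAGE_IMPLEMENTATION must be defined in exactly one translation
// unit before including this header.
#include <stdio.h>

static int  cb_read(void * user, char * data, int size) { return (int) fread(data, 1, size, (FILE *) user); }
static void cb_skip(void * user, int n)                 { fseek((FILE *) user, n, SEEK_CUR); }
static int  cb_eof (void * user)                        { return feof((FILE *) user); }

static int load_examples(void) {
    int x, y, n;

    // simplest path: load from a filename, forcing 4 output channels (RGBA)
    unsigned char * data = stbi_load("image.png", &x, &y, &n, 4);
    if (data == NULL) {
        fprintf(stderr, "stbi_load failed: %s\n", stbi_failure_reason());
        return 1;
    }
    // x*y*4 bytes of pixels are available; n still reports the channel count found in the file
    stbi_image_free(data);

    // equivalent load through the callback interface (here simply wrapping a FILE *)
    FILE * f = fopen("image.png", "rb");
    if (f != NULL) {
        stbi_io_callbacks cb = { cb_read, cb_skip, cb_eof };
        unsigned char * data2 = stbi_load_from_callbacks(&cb, f, &x, &y, &n, 0);  // 0 = keep the file's channels
        if (data2 != NULL) {
            stbi_image_free(data2);
        }
        fclose(f);
    }
    return 0;
}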
@@ -526,14 +527,14 @@ STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_fli // ZLIB client - used by PNG, available for other purposes -STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen); -STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header); -STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen); -STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); - -STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen); -STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); +STBIDEF char * stbi_zlib_decode_malloc_guesssize(const char * buffer, int len, int initial_size, int * outlen); +STBIDEF char * stbi_zlib_decode_malloc_guesssize_headerflag(const char * buffer, int len, int initial_size, int * outlen, + int parse_header); +STBIDEF char * stbi_zlib_decode_malloc(const char * buffer, int len, int * outlen); +STBIDEF int stbi_zlib_decode_buffer(char * obuffer, int olen, const char * ibuffer, int ilen); +STBIDEF char * stbi_zlib_decode_noheader_malloc(const char * buffer, int len, int * outlen); +STBIDEF int stbi_zlib_decode_noheader_buffer(char * obuffer, int olen, const char * ibuffer, int ilen); #ifdef __cplusplus } @@ -546,52 +547,50 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch #ifdef STB_IMAGE_IMPLEMENTATION -#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \ - || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \ - || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \ - || defined(STBI_ONLY_ZLIB) - #ifndef STBI_ONLY_JPEG - #define STBI_NO_JPEG - #endif - #ifndef STBI_ONLY_PNG - #define STBI_NO_PNG - #endif - #ifndef STBI_ONLY_BMP - #define STBI_NO_BMP - #endif - #ifndef STBI_ONLY_PSD - #define STBI_NO_PSD - #endif - #ifndef STBI_ONLY_TGA - #define STBI_NO_TGA - #endif - #ifndef STBI_ONLY_GIF - #define STBI_NO_GIF - #endif - #ifndef STBI_ONLY_HDR - #define STBI_NO_HDR - #endif - #ifndef STBI_ONLY_PIC - #define STBI_NO_PIC - #endif - #ifndef STBI_ONLY_PNM - #define STBI_NO_PNM - #endif +#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || \ + defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || \ + defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB) +#ifndef STBI_ONLY_JPEG +#define STBI_NO_JPEG +#endif +#ifndef STBI_ONLY_PNG +#define STBI_NO_PNG +#endif +#ifndef STBI_ONLY_BMP +#define STBI_NO_BMP +#endif +#ifndef STBI_ONLY_PSD +#define STBI_NO_PSD +#endif +#ifndef STBI_ONLY_TGA +#define STBI_NO_TGA +#endif +#ifndef STBI_ONLY_GIF +#define STBI_NO_GIF +#endif +#ifndef STBI_ONLY_HDR +#define STBI_NO_HDR +#endif +#ifndef STBI_ONLY_PIC +#define STBI_NO_PIC +#endif +#ifndef STBI_ONLY_PNM +#define STBI_NO_PNM +#endif #endif #if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB) #define STBI_NO_ZLIB #endif - +#include #include #include // ptrdiff_t on osx #include #include -#include #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) -#include // ldexp, pow +#include // ldexp, pow #endif #ifndef STBI_NO_STDIO @@ -609,55 +608,54 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch #define STBI_EXTERN extern #endif - 
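// Illustrative sketch only, not part of the patch: a slimmed-down build using the
// STBI_ONLY_* switches handled above -- with STBI_ONLY_PNG every other decoder is
// compiled out, while the zlib decoder that PNG depends on stays enabled.
#define STBI_ONLY_PNG
#define STBI_NO_STDIO              // additionally drop the FILE* convenience functions
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"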
#ifndef _MSC_VER - #ifdef __cplusplus - #define stbi_inline inline - #else - #define stbi_inline - #endif +#ifdef __cplusplus +#define stbi_inline inline #else - #define stbi_inline __forceinline +#define stbi_inline +#endif +#else +#define stbi_inline __forceinline #endif #ifndef STBI_NO_THREAD_LOCALS - #if defined(__cplusplus) && __cplusplus >= 201103L - #define STBI_THREAD_LOCAL thread_local - #elif defined(__GNUC__) && __GNUC__ < 5 - #define STBI_THREAD_LOCAL __thread - #elif defined(_MSC_VER) - #define STBI_THREAD_LOCAL __declspec(thread) - #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) - #define STBI_THREAD_LOCAL _Thread_local - #endif +#if defined(__cplusplus) && __cplusplus >= 201103L +#define STBI_THREAD_LOCAL thread_local +#elif defined(__GNUC__) && __GNUC__ < 5 +#define STBI_THREAD_LOCAL __thread +#elif defined(_MSC_VER) +#define STBI_THREAD_LOCAL __declspec(thread) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) +#define STBI_THREAD_LOCAL _Thread_local +#endif - #ifndef STBI_THREAD_LOCAL - #if defined(__GNUC__) - #define STBI_THREAD_LOCAL __thread - #endif - #endif +#ifndef STBI_THREAD_LOCAL +#if defined(__GNUC__) +#define STBI_THREAD_LOCAL __thread +#endif +#endif #endif #if defined(_MSC_VER) || defined(__SYMBIAN32__) typedef unsigned short stbi__uint16; -typedef signed short stbi__int16; -typedef unsigned int stbi__uint32; -typedef signed int stbi__int32; +typedef signed short stbi__int16; +typedef unsigned int stbi__uint32; +typedef signed int stbi__int32; #else #include <stdint.h> typedef uint16_t stbi__uint16; -typedef int16_t stbi__int16; +typedef int16_t stbi__int16; typedef uint32_t stbi__uint32; -typedef int32_t stbi__int32; +typedef int32_t stbi__int32; #endif // should produce compiler error if size is wrong -typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1]; +typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1]; #ifdef _MSC_VER -#define STBI_NOTUSED(v) (void)(v) +#define STBI_NOTUSED(v) (void)(v) #else -#define STBI_NOTUSED(v) (void)sizeof(v) +#define STBI_NOTUSED(v) (void)sizeof(v) #endif #ifdef _MSC_VER @@ -665,9 +663,9 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1]; #endif #ifdef STBI_HAS_LROTL - #define stbi_lrot(x,y) _lrotl(x,y) +#define stbi_lrot(x, y) _lrotl(x, y) #else - #define stbi_lrot(x,y) (((x) << (y)) | ((x) >> (-(y) & 31))) +#define stbi_lrot(x, y) (((x) << (y)) | ((x) >> (-(y)&31))) #endif #if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED)) @@ -679,13 +677,13 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1]; #endif #ifndef STBI_MALLOC -#define STBI_MALLOC(sz) malloc(sz) -#define STBI_REALLOC(p,newsz) realloc(p,newsz) -#define STBI_FREE(p) free(p) +#define STBI_MALLOC(sz) malloc(sz) +#define STBI_REALLOC(p, newsz) realloc(p, newsz) +#define STBI_FREE(p) free(p) #endif #ifndef STBI_REALLOC_SIZED -#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz) +#define STBI_REALLOC_SIZED(p, oldsz, newsz) STBI_REALLOC(p, newsz) #endif // x86/x64 detection @@ -727,34 +725,31 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ?
1 : -1]; #ifdef _MSC_VER -#if _MSC_VER >= 1400 // not VC6 -#include <intrin.h> // __cpuid -static int stbi__cpuid3(void) -{ - int info[4]; - __cpuid(info,1); - return info[3]; +#if _MSC_VER >= 1400 // not VC6 +#include <intrin.h> // __cpuid +static int stbi__cpuid3(void) { + int info[4]; + __cpuid(info, 1); + return info[3]; } #else -static int stbi__cpuid3(void) -{ - int res; - __asm { +static int stbi__cpuid3(void) { + int res; + __asm { mov eax,1 cpuid mov res,edx - } - return res; + } + return res; } #endif #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name #if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) -static int stbi__sse2_available(void) -{ - int info3 = stbi__cpuid3(); - return ((info3 >> 26) & 1) != 0; +static int stbi__sse2_available(void) { + int info3 = stbi__cpuid3(); + return ((info3 >> 26) & 1) != 0; } #endif @@ -762,12 +757,11 @@ static int stbi__sse2_available(void) #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) #if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) -static int stbi__sse2_available(void) -{ - // If we're even attempting to compile this on GCC/Clang, that means - // -msse2 is on, which means the compiler is allowed to use SSE2 - // instructions at will, and so are we. - return 1; +static int stbi__sse2_available(void) { + // If we're even attempting to compile this on GCC/Clang, that means + // -msse2 is on, which means the compiler is allowed to use SSE2 + // instructions at will, and so are we. + return 1; } #endif @@ -802,190 +796,162 @@ static int stbi__sse2_available(void) // stbi__context structure is our basic context used by all images, so it // contains all the IO context, plus some basic image information -typedef struct -{ - stbi__uint32 img_x, img_y; - int img_n, img_out_n; +typedef struct { + stbi__uint32 img_x, img_y; + int img_n, img_out_n; - stbi_io_callbacks io; - void *io_user_data; + stbi_io_callbacks io; + void * io_user_data; - int read_from_callbacks; - int buflen; - stbi_uc buffer_start[128]; - int callback_already_read; + int read_from_callbacks; + int buflen; + stbi_uc buffer_start[128]; + int callback_already_read; - stbi_uc *img_buffer, *img_buffer_end; - stbi_uc *img_buffer_original, *img_buffer_original_end; + stbi_uc *img_buffer, *img_buffer_end; + stbi_uc *img_buffer_original, *img_buffer_original_end; } stbi__context; - -static void stbi__refill_buffer(stbi__context *s); +static void stbi__refill_buffer(stbi__context * s); // initialize a memory-decode context -static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len) -{ - s->io.read = NULL; - s->read_from_callbacks = 0; - s->callback_already_read = 0; - s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer; - s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len; +static void stbi__start_mem(stbi__context * s, stbi_uc const * buffer, int len) { + s->io.read = NULL; + s->read_from_callbacks = 0; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = (stbi_uc *)buffer; + s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *)buffer + len; } // initialize a callback-based context -static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user) -{ - s->io = *c; - s->io_user_data = user; - s->buflen = sizeof(s->buffer_start); - s->read_from_callbacks = 1; - s->callback_already_read = 0; - s->img_buffer = s->img_buffer_original = s->buffer_start; - stbi__refill_buffer(s); - s->img_buffer_original_end = s->img_buffer_end; +static void stbi__start_callbacks(stbi__context *
s, stbi_io_callbacks * c, void * user) { + s->io = *c; + s->io_user_data = user; + s->buflen = sizeof(s->buffer_start); + s->read_from_callbacks = 1; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = s->buffer_start; + stbi__refill_buffer(s); + s->img_buffer_original_end = s->img_buffer_end; } #ifndef STBI_NO_STDIO -static int stbi__stdio_read(void *user, char *data, int size) -{ - return (int) fread(data,1,size,(FILE*) user); +static int stbi__stdio_read(void * user, char * data, int size) { return (int)fread(data, 1, size, (FILE *)user); } + +static void stbi__stdio_skip(void * user, int n) { + int ch; + fseek((FILE *)user, n, SEEK_CUR); + ch = fgetc((FILE *)user); /* have to read a byte to reset feof()'s flag */ + if (ch != EOF) { + ungetc(ch, (FILE *)user); /* push byte back onto stream if valid. */ + } } -static void stbi__stdio_skip(void *user, int n) -{ - int ch; - fseek((FILE*) user, n, SEEK_CUR); - ch = fgetc((FILE*) user); /* have to read a byte to reset feof()'s flag */ - if (ch != EOF) { - ungetc(ch, (FILE *) user); /* push byte back onto stream if valid. */ - } -} +static int stbi__stdio_eof(void * user) { return feof((FILE *)user) || ferror((FILE *)user); } -static int stbi__stdio_eof(void *user) -{ - return feof((FILE*) user) || ferror((FILE *) user); -} - -static stbi_io_callbacks stbi__stdio_callbacks = -{ - stbi__stdio_read, - stbi__stdio_skip, - stbi__stdio_eof, +static stbi_io_callbacks stbi__stdio_callbacks = { + stbi__stdio_read, + stbi__stdio_skip, + stbi__stdio_eof, }; -static void stbi__start_file(stbi__context *s, FILE *f) -{ - stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f); -} +static void stbi__start_file(stbi__context * s, FILE * f) { stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *)f); } -//static void stop_file(stbi__context *s) { } +// static void stop_file(stbi__context *s) { } #endif // !STBI_NO_STDIO -static void stbi__rewind(stbi__context *s) -{ - // conceptually rewind SHOULD rewind to the beginning of the stream, - // but we just rewind to the beginning of the initial buffer, because - // we only use it after doing 'test', which only ever looks at at most 92 bytes - s->img_buffer = s->img_buffer_original; - s->img_buffer_end = s->img_buffer_original_end; +static void stbi__rewind(stbi__context * s) { + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; } -enum -{ - STBI_ORDER_RGB, - STBI_ORDER_BGR -}; +enum { STBI_ORDER_RGB, STBI_ORDER_BGR }; -typedef struct -{ - int bits_per_channel; - int num_channels; - int channel_order; +typedef struct { + int bits_per_channel; + int num_channels; + int channel_order; } stbi__result_info; #ifndef STBI_NO_JPEG -static int stbi__jpeg_test(stbi__context *s); -static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); -static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__jpeg_test(stbi__context * s); +static void * stbi__jpeg_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__jpeg_info(stbi__context * s, int * x, int * y, int * comp); #endif #ifndef STBI_NO_PNG -static int stbi__png_test(stbi__context *s); -static void *stbi__png_load(stbi__context *s, int *x, int 
*y, int *comp, int req_comp, stbi__result_info *ri); -static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp); -static int stbi__png_is16(stbi__context *s); +static int stbi__png_test(stbi__context * s); +static void * stbi__png_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__png_info(stbi__context * s, int * x, int * y, int * comp); +static int stbi__png_is16(stbi__context * s); #endif #ifndef STBI_NO_BMP -static int stbi__bmp_test(stbi__context *s); -static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); -static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__bmp_test(stbi__context * s); +static void * stbi__bmp_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__bmp_info(stbi__context * s, int * x, int * y, int * comp); #endif #ifndef STBI_NO_TGA -static int stbi__tga_test(stbi__context *s); -static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); -static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__tga_test(stbi__context * s); +static void * stbi__tga_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__tga_info(stbi__context * s, int * x, int * y, int * comp); #endif #ifndef STBI_NO_PSD -static int stbi__psd_test(stbi__context *s); -static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc); -static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp); -static int stbi__psd_is16(stbi__context *s); +static int stbi__psd_test(stbi__context * s); +static void * stbi__psd_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri, int bpc); +static int stbi__psd_info(stbi__context * s, int * x, int * y, int * comp); +static int stbi__psd_is16(stbi__context * s); #endif #ifndef STBI_NO_HDR -static int stbi__hdr_test(stbi__context *s); -static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); -static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__hdr_test(stbi__context * s); +static float * stbi__hdr_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__hdr_info(stbi__context * s, int * x, int * y, int * comp); #endif #ifndef STBI_NO_PIC -static int stbi__pic_test(stbi__context *s); -static void *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); -static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__pic_test(stbi__context * s); +static void * stbi__pic_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__pic_info(stbi__context * s, int * x, int * y, int * comp); #endif #ifndef STBI_NO_GIF -static int stbi__gif_test(stbi__context *s); -static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); -static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp); -static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__gif_test(stbi__context * s); +static void * stbi__gif_load(stbi__context * s, int * x, 
int * y, int * comp, int req_comp, stbi__result_info * ri); +static void * stbi__load_gif_main(stbi__context * s, int ** delays, int * x, int * y, int * z, int * comp, int req_comp); +static int stbi__gif_info(stbi__context * s, int * x, int * y, int * comp); #endif #ifndef STBI_NO_PNM -static int stbi__pnm_test(stbi__context *s); -static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); -static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp); -static int stbi__pnm_is16(stbi__context *s); +static int stbi__pnm_test(stbi__context * s); +static void * stbi__pnm_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__pnm_info(stbi__context * s, int * x, int * y, int * comp); +static int stbi__pnm_is16(stbi__context * s); #endif static #ifdef STBI_THREAD_LOCAL -STBI_THREAD_LOCAL + STBI_THREAD_LOCAL #endif -const char *stbi__g_failure_reason; + const char * stbi__g_failure_reason; -STBIDEF const char *stbi_failure_reason(void) -{ - return stbi__g_failure_reason; -} +STBIDEF const char * stbi_failure_reason(void) { return stbi__g_failure_reason; } #ifndef STBI_NO_FAILURE_STRINGS -static int stbi__err(const char *str) -{ - stbi__g_failure_reason = str; - return 0; +static int stbi__err(const char * str) { + stbi__g_failure_reason = str; + return 0; } #endif -static void *stbi__malloc(size_t size) -{ - return STBI_MALLOC(size); -} +static void * stbi__malloc(size_t size) { return STBI_MALLOC(size); } // stb_image uses ints pervasively, including for offset calculations. // therefore the largest decoded image size we can support with the @@ -999,88 +965,88 @@ static void *stbi__malloc(size_t size) // return 1 if the sum is valid, 0 on overflow. // negative terms are considered invalid. -static int stbi__addsizes_valid(int a, int b) -{ - if (b < 0) return 0; - // now 0 <= b <= INT_MAX, hence also - // 0 <= INT_MAX - b <= INTMAX. - // And "a + b <= INT_MAX" (which might overflow) is the - // same as a <= INT_MAX - b (no overflow) - return a <= INT_MAX - b; +static int stbi__addsizes_valid(int a, int b) { + if (b < 0) + return 0; + // now 0 <= b <= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; } // returns 1 if the product is valid, 0 on overflow. // negative factors are considered invalid. 
-static int stbi__mul2sizes_valid(int a, int b) -{ - if (a < 0 || b < 0) return 0; - if (b == 0) return 1; // mul-by-0 is always safe - // portable way to check for no overflows in a*b - return a <= INT_MAX/b; +static int stbi__mul2sizes_valid(int a, int b) { + if (a < 0 || b < 0) + return 0; + if (b == 0) + return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX / b; } #if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow -static int stbi__mad2sizes_valid(int a, int b, int add) -{ - return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add); +static int stbi__mad2sizes_valid(int a, int b, int add) { + return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a * b, add); } #endif // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow -static int stbi__mad3sizes_valid(int a, int b, int c, int add) -{ - return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && - stbi__addsizes_valid(a*b*c, add); +static int stbi__mad3sizes_valid(int a, int b, int c, int add) { + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__addsizes_valid(a * b * c, add); } // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) -static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) -{ - return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && - stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add); +static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) { + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) && + stbi__addsizes_valid(a * b * c * d, add); } #endif #if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) // mallocs with size overflow checking -static void *stbi__malloc_mad2(int a, int b, int add) -{ - if (!stbi__mad2sizes_valid(a, b, add)) return NULL; - return stbi__malloc(a*b + add); +static void * stbi__malloc_mad2(int a, int b, int add) { + if (!stbi__mad2sizes_valid(a, b, add)) + return NULL; + return stbi__malloc(a * b + add); } #endif -static void *stbi__malloc_mad3(int a, int b, int c, int add) -{ - if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL; - return stbi__malloc(a*b*c + add); +static void * stbi__malloc_mad3(int a, int b, int c, int add) { + if (!stbi__mad3sizes_valid(a, b, c, add)) + return NULL; + return stbi__malloc(a * b * c + add); } #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) -static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) -{ - if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; - return stbi__malloc(a*b*c*d + add); +static void * stbi__malloc_mad4(int a, int b, int c, int d, int add) { + if (!stbi__mad4sizes_valid(a, b, c, d, add)) + return NULL; + return stbi__malloc(a * b * c * d + add); } #endif // returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow. -static int stbi__addints_valid(int a, int b) -{ - if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow - if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0. 
- return a <= INT_MAX - b; +static int stbi__addints_valid(int a, int b) { + if ((a >= 0) != (b >= 0)) + return 1; // a and b have different signs, so no overflow + if (a < 0 && b < 0) + return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0. + return a <= INT_MAX - b; } -// returns 1 if the product of two ints fits in a signed short, 0 on overflow. -static int stbi__mul2shorts_valid(int a, int b) -{ - if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow - if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid - if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN - return a >= SHRT_MIN / b; +// returns 1 if the product of two signed shorts is valid, 0 on overflow. +static int stbi__mul2shorts_valid(short a, short b) { + if (b == 0 || b == -1) + return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow + if ((a >= 0) == (b >= 0)) + return a <= SHRT_MAX / b; // product is positive, so similar to mul2sizes_valid + if (b < 0) + return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN + return a >= SHRT_MIN / b; } // stbi__err - error @@ -1088,423 +1054,411 @@ static int stbi__mul2shorts_valid(int a, int b) // stbi__errpuc - error returning pointer to unsigned char #ifdef STBI_NO_FAILURE_STRINGS - #define stbi__err(x,y) 0 +#define stbi__err(x, y) 0 #elif defined(STBI_FAILURE_USERMSG) - #define stbi__err(x,y) stbi__err(y) +#define stbi__err(x, y) stbi__err(y) #else - #define stbi__err(x,y) stbi__err(x) +#define stbi__err(x, y) stbi__err(x) #endif -#define stbi__errpf(x,y) ((float *)(size_t) (stbi__err(x,y)?NULL:NULL)) -#define stbi__errpuc(x,y) ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL)) +#define stbi__errpf(x, y) ((float *)(size_t)(stbi__err(x, y) ? NULL : NULL)) +#define stbi__errpuc(x, y) ((unsigned char *)(size_t)(stbi__err(x, y) ? 
NULL : NULL)) -STBIDEF void stbi_image_free(void *retval_from_stbi_load) -{ - STBI_FREE(retval_from_stbi_load); -} +STBIDEF void stbi_image_free(void * retval_from_stbi_load) { STBI_FREE(retval_from_stbi_load); } #ifndef STBI_NO_LINEAR -static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp); +static float * stbi__ldr_to_hdr(stbi_uc * data, int x, int y, int comp); #endif #ifndef STBI_NO_HDR -static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp); +static stbi_uc * stbi__hdr_to_ldr(float * data, int x, int y, int comp); #endif static int stbi__vertically_flip_on_load_global = 0; -STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) -{ - stbi__vertically_flip_on_load_global = flag_true_if_should_flip; +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) { + stbi__vertically_flip_on_load_global = flag_true_if_should_flip; } #ifndef STBI_THREAD_LOCAL -#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global +#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global #else static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set; -STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) -{ - stbi__vertically_flip_on_load_local = flag_true_if_should_flip; - stbi__vertically_flip_on_load_set = 1; +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) { + stbi__vertically_flip_on_load_local = flag_true_if_should_flip; + stbi__vertically_flip_on_load_set = 1; } -#define stbi__vertically_flip_on_load (stbi__vertically_flip_on_load_set \ - ? stbi__vertically_flip_on_load_local \ - : stbi__vertically_flip_on_load_global) +#define stbi__vertically_flip_on_load \ + (stbi__vertically_flip_on_load_set ? 
stbi__vertically_flip_on_load_local : stbi__vertically_flip_on_load_global) #endif // STBI_THREAD_LOCAL -static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) -{ - memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields - ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed - ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order - ri->num_channels = 0; +static void * stbi__load_main(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri, int bpc) { + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + ri->num_channels = 0; - // test the formats with a very explicit header first (at least a FOURCC - // or distinctive magic number first) - #ifndef STBI_NO_PNG - if (stbi__png_test(s)) return stbi__png_load(s,x,y,comp,req_comp, ri); - #endif - #ifndef STBI_NO_BMP - if (stbi__bmp_test(s)) return stbi__bmp_load(s,x,y,comp,req_comp, ri); - #endif - #ifndef STBI_NO_GIF - if (stbi__gif_test(s)) return stbi__gif_load(s,x,y,comp,req_comp, ri); - #endif - #ifndef STBI_NO_PSD - if (stbi__psd_test(s)) return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc); - #else - STBI_NOTUSED(bpc); - #endif - #ifndef STBI_NO_PIC - if (stbi__pic_test(s)) return stbi__pic_load(s,x,y,comp,req_comp, ri); - #endif +// test the formats with a very explicit header first (at least a FOURCC +// or distinctive magic number first) +#ifndef STBI_NO_PNG + if (stbi__png_test(s)) + return stbi__png_load(s, x, y, comp, req_comp, ri); +#endif +#ifndef STBI_NO_BMP + if (stbi__bmp_test(s)) + return stbi__bmp_load(s, x, y, comp, req_comp, ri); +#endif +#ifndef STBI_NO_GIF + if (stbi__gif_test(s)) + return stbi__gif_load(s, x, y, comp, req_comp, ri); +#endif +#ifndef STBI_NO_PSD + if (stbi__psd_test(s)) + return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc); +#else + STBI_NOTUSED(bpc); +#endif +#ifndef STBI_NO_PIC + if (stbi__pic_test(s)) + return stbi__pic_load(s, x, y, comp, req_comp, ri); +#endif - // then the formats that can end up attempting to load with just 1 or 2 - // bytes matching expectations; these are prone to false positives, so - // try them later - #ifndef STBI_NO_JPEG - if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); - #endif - #ifndef STBI_NO_PNM - if (stbi__pnm_test(s)) return stbi__pnm_load(s,x,y,comp,req_comp, ri); - #endif +// then the formats that can end up attempting to load with just 1 or 2 +// bytes matching expectations; these are prone to false positives, so +// try them later +#ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) + return stbi__jpeg_load(s, x, y, comp, req_comp, ri); +#endif +#ifndef STBI_NO_PNM + if (stbi__pnm_test(s)) + return stbi__pnm_load(s, x, y, comp, req_comp, ri); +#endif - #ifndef STBI_NO_HDR - if (stbi__hdr_test(s)) { - float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri); - return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); - } - #endif +#ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + float * hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri); + return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); + } +#endif - #ifndef STBI_NO_TGA - // test tga last because it's a crappy test! 
- if (stbi__tga_test(s)) - return stbi__tga_load(s,x,y,comp,req_comp, ri); - #endif +#ifndef STBI_NO_TGA + // test tga last because it's a crappy test! + if (stbi__tga_test(s)) + return stbi__tga_load(s, x, y, comp, req_comp, ri); +#endif - return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); + return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); } -static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels) -{ - int i; - int img_len = w * h * channels; - stbi_uc *reduced; +static stbi_uc * stbi__convert_16_to_8(stbi__uint16 * orig, int w, int h, int channels) { + int i; + int img_len = w * h * channels; + stbi_uc * reduced; - reduced = (stbi_uc *) stbi__malloc(img_len); - if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory"); + reduced = (stbi_uc *)stbi__malloc(img_len); + if (reduced == NULL) + return stbi__errpuc("outofmem", "Out of memory"); - for (i = 0; i < img_len; ++i) - reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + for (i = 0; i < img_len; ++i) + reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling - STBI_FREE(orig); - return reduced; + STBI_FREE(orig); + return reduced; } -static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels) -{ - int i; - int img_len = w * h * channels; - stbi__uint16 *enlarged; +static stbi__uint16 * stbi__convert_8_to_16(stbi_uc * orig, int w, int h, int channels) { + int i; + int img_len = w * h * channels; + stbi__uint16 * enlarged; - enlarged = (stbi__uint16 *) stbi__malloc(img_len*2); - if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + enlarged = (stbi__uint16 *)stbi__malloc(img_len * 2); + if (enlarged == NULL) + return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory"); - for (i = 0; i < img_len; ++i) - enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff + for (i = 0; i < img_len; ++i) + enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff - STBI_FREE(orig); - return enlarged; + STBI_FREE(orig); + return enlarged; } -static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel) -{ - int row; - size_t bytes_per_row = (size_t)w * bytes_per_pixel; - stbi_uc temp[2048]; - stbi_uc *bytes = (stbi_uc *)image; +static void stbi__vertical_flip(void * image, int w, int h, int bytes_per_pixel) { + int row; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; + stbi_uc temp[2048]; + stbi_uc * bytes = (stbi_uc *)image; - for (row = 0; row < (h>>1); row++) { - stbi_uc *row0 = bytes + row*bytes_per_row; - stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row; - // swap row0 with row1 - size_t bytes_left = bytes_per_row; - while (bytes_left) { - size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp); - memcpy(temp, row0, bytes_copy); - memcpy(row0, row1, bytes_copy); - memcpy(row1, temp, bytes_copy); - row0 += bytes_copy; - row1 += bytes_copy; - bytes_left -= bytes_copy; - } - } + for (row = 0; row < (h >> 1); row++) { + stbi_uc * row0 = bytes + row * bytes_per_row; + stbi_uc * row1 = bytes + (h - row - 1) * bytes_per_row; + // swap row0 with row1 + size_t bytes_left = bytes_per_row; + while (bytes_left) { + size_t bytes_copy = (bytes_left < sizeof(temp)) ? 
bytes_left : sizeof(temp); + memcpy(temp, row0, bytes_copy); + memcpy(row0, row1, bytes_copy); + memcpy(row1, temp, bytes_copy); + row0 += bytes_copy; + row1 += bytes_copy; + bytes_left -= bytes_copy; + } + } } #ifndef STBI_NO_GIF -static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel) -{ - int slice; - int slice_size = w * h * bytes_per_pixel; +static void stbi__vertical_flip_slices(void * image, int w, int h, int z, int bytes_per_pixel) { + int slice; + int slice_size = w * h * bytes_per_pixel; - stbi_uc *bytes = (stbi_uc *)image; - for (slice = 0; slice < z; ++slice) { - stbi__vertical_flip(bytes, w, h, bytes_per_pixel); - bytes += slice_size; - } + stbi_uc * bytes = (stbi_uc *)image; + for (slice = 0; slice < z; ++slice) { + stbi__vertical_flip(bytes, w, h, bytes_per_pixel); + bytes += slice_size; + } } #endif -static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) -{ - stbi__result_info ri; - void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); +static unsigned char * stbi__load_and_postprocess_8bit(stbi__context * s, int * x, int * y, int * comp, int req_comp) { + stbi__result_info ri; + void * result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); - if (result == NULL) - return NULL; + if (result == NULL) + return NULL; - // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. - STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); - if (ri.bits_per_channel != 8) { - result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp); - ri.bits_per_channel = 8; - } + if (ri.bits_per_channel != 8) { + result = stbi__convert_16_to_8((stbi__uint16 *)result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 8; + } - // @TODO: move stbi__convert_format to here + // @TODO: move stbi__convert_format to here - if (stbi__vertically_flip_on_load) { - int channels = req_comp ? req_comp : *comp; - stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); - } + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); + } - return (unsigned char *) result; + return (unsigned char *)result; } -static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) -{ - stbi__result_info ri; - void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); +static stbi__uint16 * stbi__load_and_postprocess_16bit(stbi__context * s, int * x, int * y, int * comp, int req_comp) { + stbi__result_info ri; + void * result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); - if (result == NULL) - return NULL; + if (result == NULL) + return NULL; - // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. - STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); - if (ri.bits_per_channel != 16) { - result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? 
*comp : req_comp); - ri.bits_per_channel = 16; - } + if (ri.bits_per_channel != 16) { + result = stbi__convert_8_to_16((stbi_uc *)result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 16; + } - // @TODO: move stbi__convert_format16 to here - // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision + // @TODO: move stbi__convert_format16 to here + // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision - if (stbi__vertically_flip_on_load) { - int channels = req_comp ? req_comp : *comp; - stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); - } + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); + } - return (stbi__uint16 *) result; + return (stbi__uint16 *)result; } #if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR) -static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp) -{ - if (stbi__vertically_flip_on_load && result != NULL) { - int channels = req_comp ? req_comp : *comp; - stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); - } +static void stbi__float_postprocess(float * result, int * x, int * y, int * comp, int req_comp) { + if (stbi__vertically_flip_on_load && result != NULL) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); + } } #endif #ifndef STBI_NO_STDIO #if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) -STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); -STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); +STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char * str, + int cbmb, wchar_t * widestr, int cchwide); +STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, + const wchar_t * widestr, int cchwide, char * str, int cbmb, + const char * defchar, int * used_default); #endif #if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) -STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) -{ - return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); +STBIDEF int stbi_convert_wchar_to_utf8(char * buffer, size_t bufferlen, const wchar_t * input) { + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int)bufferlen, NULL, NULL); } #endif -static FILE *stbi__fopen(char const *filename, char const *mode) -{ - FILE *f; +static FILE * stbi__fopen(char const * filename, char const * mode) { + FILE * f; #if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) - wchar_t wMode[64]; - wchar_t wFilename[1024]; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) - return 0; + wchar_t wMode[64]; + wchar_t wFilename[1024]; + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename) / sizeof(*wFilename))) + return 0; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) - return 0; + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode) / 
sizeof(*wMode))) + return 0; #if defined(_MSC_VER) && _MSC_VER >= 1400 - if (0 != _wfopen_s(&f, wFilename, wMode)) - f = 0; + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; #else - f = _wfopen(wFilename, wMode); + f = _wfopen(wFilename, wMode); #endif #elif defined(_MSC_VER) && _MSC_VER >= 1400 - if (0 != fopen_s(&f, filename, mode)) - f=0; + if (0 != fopen_s(&f, filename, mode)) + f = 0; #else - f = fopen(filename, mode); + f = fopen(filename, mode); #endif - return f; + return f; } - -STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp) -{ - FILE *f = stbi__fopen(filename, "rb"); - unsigned char *result; - if (!f) return stbi__errpuc("can't fopen", "Unable to open file"); - result = stbi_load_from_file(f,x,y,comp,req_comp); - fclose(f); - return result; +STBIDEF stbi_uc * stbi_load(char const * filename, int * x, int * y, int * comp, int req_comp) { + FILE * f = stbi__fopen(filename, "rb"); + unsigned char * result; + if (!f) + return stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file(f, x, y, comp, req_comp); + fclose(f); + return result; } -STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) -{ - unsigned char *result; - stbi__context s; - stbi__start_file(&s,f); - result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); - if (result) { - // need to 'unget' all the characters in the IO buffer - fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); - } - return result; +STBIDEF stbi_uc * stbi_load_from_file(FILE * f, int * x, int * y, int * comp, int req_comp) { + unsigned char * result; + stbi__context s; + stbi__start_file(&s, f); + result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; } -STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp) -{ - stbi__uint16 *result; - stbi__context s; - stbi__start_file(&s,f); - result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp); - if (result) { - // need to 'unget' all the characters in the IO buffer - fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); - } - return result; +STBIDEF stbi__uint16 * stbi_load_from_file_16(FILE * f, int * x, int * y, int * comp, int req_comp) { + stbi__uint16 * result; + stbi__context s; + stbi__start_file(&s, f); + result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; } -STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp) -{ - FILE *f = stbi__fopen(filename, "rb"); - stbi__uint16 *result; - if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file"); - result = stbi_load_from_file_16(f,x,y,comp,req_comp); - fclose(f); - return result; +STBIDEF stbi_us * stbi_load_16(char const * filename, int * x, int * y, int * comp, int req_comp) { + FILE * f = stbi__fopen(filename, "rb"); + stbi__uint16 * result; + if (!f) + return (stbi_us *)stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file_16(f, x, y, comp, req_comp); + fclose(f); + return result; } +#endif //! 
STBI_NO_STDIO -#endif //!STBI_NO_STDIO - -STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels) -{ - stbi__context s; - stbi__start_mem(&s,buffer,len); - return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +STBIDEF stbi_us * stbi_load_16_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file, + int desired_channels) { + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, desired_channels); } -STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels) -{ - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); - return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +STBIDEF stbi_us * stbi_load_16_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, + int * channels_in_file, int desired_channels) { + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, desired_channels); } -STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) -{ - stbi__context s; - stbi__start_mem(&s,buffer,len); - return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +STBIDEF stbi_uc * stbi_load_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp, int req_comp) { + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); } -STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) -{ - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); - return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +STBIDEF stbi_uc * stbi_load_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * comp, + int req_comp) { + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); } #ifndef STBI_NO_GIF -STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp) -{ - unsigned char *result; - stbi__context s; - stbi__start_mem(&s,buffer,len); +STBIDEF stbi_uc * stbi_load_gif_from_memory(stbi_uc const * buffer, int len, int ** delays, int * x, int * y, int * z, + int * comp, int req_comp) { + unsigned char * result; + stbi__context s; + stbi__start_mem(&s, buffer, len); - result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); - if (stbi__vertically_flip_on_load) { - stbi__vertical_flip_slices( result, *x, *y, *z, *comp ); - } + result = (unsigned char *)stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); + if (stbi__vertically_flip_on_load) { + stbi__vertical_flip_slices(result, *x, *y, *z, *comp); + } - return result; + return result; } #endif #ifndef STBI_NO_LINEAR -static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp) -{ - unsigned char *data; - #ifndef STBI_NO_HDR - if (stbi__hdr_test(s)) { - stbi__result_info ri; - float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri); - if (hdr_data) - stbi__float_postprocess(hdr_data,x,y,comp,req_comp); - return hdr_data; 
- } - #endif - data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); - if (data) - return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp); - return stbi__errpf("unknown image type", "Image not of any known type, or corrupt"); +static float * stbi__loadf_main(stbi__context * s, int * x, int * y, int * comp, int req_comp) { + unsigned char * data; +#ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + stbi__result_info ri; + float * hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri); + if (hdr_data) + stbi__float_postprocess(hdr_data, x, y, comp, req_comp); + return hdr_data; + } +#endif + data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); + if (data) + return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp); + return stbi__errpf("unknown image type", "Image not of any known type, or corrupt"); } -STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) -{ - stbi__context s; - stbi__start_mem(&s,buffer,len); - return stbi__loadf_main(&s,x,y,comp,req_comp); +STBIDEF float * stbi_loadf_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp, int req_comp) { + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__loadf_main(&s, x, y, comp, req_comp); } -STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) -{ - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); - return stbi__loadf_main(&s,x,y,comp,req_comp); +STBIDEF float * stbi_loadf_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * comp, + int req_comp) { + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return stbi__loadf_main(&s, x, y, comp, req_comp); } #ifndef STBI_NO_STDIO -STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp) -{ - float *result; - FILE *f = stbi__fopen(filename, "rb"); - if (!f) return stbi__errpf("can't fopen", "Unable to open file"); - result = stbi_loadf_from_file(f,x,y,comp,req_comp); - fclose(f); - return result; +STBIDEF float * stbi_loadf(char const * filename, int * x, int * y, int * comp, int req_comp) { + float * result; + FILE * f = stbi__fopen(filename, "rb"); + if (!f) + return stbi__errpf("can't fopen", "Unable to open file"); + result = stbi_loadf_from_file(f, x, y, comp, req_comp); + fclose(f); + return result; } -STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) -{ - stbi__context s; - stbi__start_file(&s,f); - return stbi__loadf_main(&s,x,y,comp,req_comp); +STBIDEF float * stbi_loadf_from_file(FILE * f, int * x, int * y, int * comp, int req_comp) { + stbi__context s; + stbi__start_file(&s, f); + return stbi__loadf_main(&s, x, y, comp, req_comp); } #endif // !STBI_NO_STDIO @@ -1514,222 +1468,208 @@ STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_ // defined, for API simplicity; if STBI_NO_LINEAR is defined, it always // reports false! 
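The comment above notes that the stbi_is_hdr family is always declared even when HDR/linear support is compiled out, in which case it simply reports false. A hedged usage sketch of that probe, assuming the implementation is compiled in another translation unit of the project (the file name and helper are illustrative only):

    #include <stdio.h>
    #include "stb_image.h"

    static void load_any(const char * path) {
        int w, h, comp;
        if (stbi_is_hdr(path)) {
            // HDR path: linear float data (e.g. Radiance .hdr); with STBI_NO_LINEAR this branch is never taken
            float * hdr = stbi_loadf(path, &w, &h, &comp, 0);
            if (hdr) {
                printf("HDR %dx%d, %d channel(s)\n", w, h, comp);
                stbi_image_free(hdr);
            }
        } else {
            // LDR path: plain 8-bit data
            stbi_uc * ldr = stbi_load(path, &w, &h, &comp, 0);
            if (ldr) {
                printf("LDR %dx%d, %d channel(s)\n", w, h, comp);
                stbi_image_free(ldr);
            }
        }
    }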
-STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) -{ - #ifndef STBI_NO_HDR - stbi__context s; - stbi__start_mem(&s,buffer,len); - return stbi__hdr_test(&s); - #else - STBI_NOTUSED(buffer); - STBI_NOTUSED(len); - return 0; - #endif +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const * buffer, int len) { +#ifndef STBI_NO_HDR + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__hdr_test(&s); +#else + STBI_NOTUSED(buffer); + STBI_NOTUSED(len); + return 0; +#endif } #ifndef STBI_NO_STDIO -STBIDEF int stbi_is_hdr (char const *filename) -{ - FILE *f = stbi__fopen(filename, "rb"); - int result=0; - if (f) { - result = stbi_is_hdr_from_file(f); - fclose(f); - } - return result; +STBIDEF int stbi_is_hdr(char const * filename) { + FILE * f = stbi__fopen(filename, "rb"); + int result = 0; + if (f) { + result = stbi_is_hdr_from_file(f); + fclose(f); + } + return result; } -STBIDEF int stbi_is_hdr_from_file(FILE *f) -{ - #ifndef STBI_NO_HDR - long pos = ftell(f); - int res; - stbi__context s; - stbi__start_file(&s,f); - res = stbi__hdr_test(&s); - fseek(f, pos, SEEK_SET); - return res; - #else - STBI_NOTUSED(f); - return 0; - #endif +STBIDEF int stbi_is_hdr_from_file(FILE * f) { +#ifndef STBI_NO_HDR + long pos = ftell(f); + int res; + stbi__context s; + stbi__start_file(&s, f); + res = stbi__hdr_test(&s); + fseek(f, pos, SEEK_SET); + return res; +#else + STBI_NOTUSED(f); + return 0; +#endif } #endif // !STBI_NO_STDIO -STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user) -{ - #ifndef STBI_NO_HDR - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); - return stbi__hdr_test(&s); - #else - STBI_NOTUSED(clbk); - STBI_NOTUSED(user); - return 0; - #endif +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const * clbk, void * user) { +#ifndef STBI_NO_HDR + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return stbi__hdr_test(&s); +#else + STBI_NOTUSED(clbk); + STBI_NOTUSED(user); + return 0; +#endif } #ifndef STBI_NO_LINEAR -static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f; +static float stbi__l2h_gamma = 2.2f, stbi__l2h_scale = 1.0f; -STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } -STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } +STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } #endif -static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f; - -STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; } -STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; } +static float stbi__h2l_gamma_i = 1.0f / 2.2f, stbi__h2l_scale_i = 1.0f; +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1 / gamma; } +STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1 / scale; } ////////////////////////////////////////////////////////////////////////////// // // Common code used by all image loaders // -enum -{ - STBI__SCAN_load=0, - STBI__SCAN_type, - STBI__SCAN_header -}; +enum { STBI__SCAN_load = 0, STBI__SCAN_type, STBI__SCAN_header }; -static void stbi__refill_buffer(stbi__context *s) -{ - int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen); - s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original); - if (n == 0) { - // at end of file, treat same as if from memory, but need to handle case - // where 
s->img_buffer isn't pointing to safe memory, e.g. 0-byte file - s->read_from_callbacks = 0; - s->img_buffer = s->buffer_start; - s->img_buffer_end = s->buffer_start+1; - *s->img_buffer = 0; - } else { - s->img_buffer = s->buffer_start; - s->img_buffer_end = s->buffer_start + n; - } +static void stbi__refill_buffer(stbi__context * s) { + int n = (s->io.read)(s->io_user_data, (char *)s->buffer_start, s->buflen); + s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original); + if (n == 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + 1; + *s->img_buffer = 0; + } else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } } -stbi_inline static stbi_uc stbi__get8(stbi__context *s) -{ - if (s->img_buffer < s->img_buffer_end) - return *s->img_buffer++; - if (s->read_from_callbacks) { - stbi__refill_buffer(s); - return *s->img_buffer++; - } - return 0; +stbi_inline static stbi_uc stbi__get8(stbi__context * s) { + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + stbi__refill_buffer(s); + return *s->img_buffer++; + } + return 0; } #if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) // nothing #else -stbi_inline static int stbi__at_eof(stbi__context *s) -{ - if (s->io.read) { - if (!(s->io.eof)(s->io_user_data)) return 0; - // if feof() is true, check if buffer = end - // special case: we've only got the special 0 character at the end - if (s->read_from_callbacks == 0) return 1; - } +stbi_inline static int stbi__at_eof(stbi__context * s) { + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) + return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) + return 1; + } - return s->img_buffer >= s->img_buffer_end; + return s->img_buffer >= s->img_buffer_end; } #endif -#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && \ + defined(STBI_NO_GIF) && defined(STBI_NO_PIC) // nothing #else -static void stbi__skip(stbi__context *s, int n) -{ - if (n == 0) return; // already there! - if (n < 0) { - s->img_buffer = s->img_buffer_end; - return; - } - if (s->io.read) { - int blen = (int) (s->img_buffer_end - s->img_buffer); - if (blen < n) { - s->img_buffer = s->img_buffer_end; - (s->io.skip)(s->io_user_data, n - blen); - return; - } - } - s->img_buffer += n; +static void stbi__skip(stbi__context * s, int n) { + if (n == 0) + return; // already there! 
+ if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; } #endif #if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM) // nothing #else -static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) -{ - if (s->io.read) { - int blen = (int) (s->img_buffer_end - s->img_buffer); - if (blen < n) { - int res, count; +static int stbi__getn(stbi__context * s, stbi_uc * buffer, int n) { + if (s->io.read) { + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { + int res, count; - memcpy(buffer, s->img_buffer, blen); + memcpy(buffer, s->img_buffer, blen); - count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen); - res = (count == (n-blen)); - s->img_buffer = s->img_buffer_end; - return res; - } - } + count = (s->io.read)(s->io_user_data, (char *)buffer + blen, n - blen); + res = (count == (n - blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } - if (s->img_buffer+n <= s->img_buffer_end) { - memcpy(buffer, s->img_buffer, n); - s->img_buffer += n; - return 1; - } else - return 0; + if (s->img_buffer + n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } else + return 0; } #endif #if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) // nothing #else -static int stbi__get16be(stbi__context *s) -{ - int z = stbi__get8(s); - return (z << 8) + stbi__get8(s); +static int stbi__get16be(stbi__context * s) { + int z = stbi__get8(s); + return (z << 8) + stbi__get8(s); } #endif #if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) // nothing #else -static stbi__uint32 stbi__get32be(stbi__context *s) -{ - stbi__uint32 z = stbi__get16be(s); - return (z << 16) + stbi__get16be(s); +static stbi__uint32 stbi__get32be(stbi__context * s) { + stbi__uint32 z = stbi__get16be(s); + return (z << 16) + stbi__get16be(s); } #endif #if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) // nothing #else -static int stbi__get16le(stbi__context *s) -{ - int z = stbi__get8(s); - return z + (stbi__get8(s) << 8); +static int stbi__get16le(stbi__context * s) { + int z = stbi__get8(s); + return z + (stbi__get8(s) << 8); } #endif #ifndef STBI_NO_BMP -static stbi__uint32 stbi__get32le(stbi__context *s) -{ - stbi__uint32 z = stbi__get16le(s); - z += (stbi__uint32)stbi__get16le(s) << 16; - return z; +static stbi__uint32 stbi__get32le(stbi__context * s) { + stbi__uint32 z = stbi__get16le(s); + z += (stbi__uint32)stbi__get16le(s) << 16; + return z; } #endif -#define STBI__BYTECAST(x) ((stbi_uc) ((x) & 255)) // truncate int to byte without warnings +#define STBI__BYTECAST(x) ((stbi_uc)((x)&255)) // truncate int to byte without warnings -#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && \ + defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) // nothing #else ////////////////////////////////////////////////////////////////////////////// @@ -1743,169 +1683,264 @@ static stbi__uint32 stbi__get32le(stbi__context *s) // assume 
data buffer is malloced, so malloc a new one and free that one // only failure mode is malloc failing -static stbi_uc stbi__compute_y(int r, int g, int b) -{ - return (stbi_uc) (((r*77) + (g*150) + (29*b)) >> 8); -} +static stbi_uc stbi__compute_y(int r, int g, int b) { return (stbi_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8); } #endif -#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && \ + defined(STBI_NO_PIC) && defined(STBI_NO_PNM) // nothing #else -static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y) -{ - int i,j; - unsigned char *good; +static unsigned char * stbi__convert_format(unsigned char * data, int img_n, int req_comp, unsigned int x, unsigned int y) { + int i, j; + unsigned char * good; - if (req_comp == img_n) return data; - STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + if (req_comp == img_n) + return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); - good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0); - if (good == NULL) { - STBI_FREE(data); - return stbi__errpuc("outofmem", "Out of memory"); - } + good = (unsigned char *)stbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { + STBI_FREE(data); + return stbi__errpuc("outofmem", "Out of memory"); + } - for (j=0; j < (int) y; ++j) { - unsigned char *src = data + j * x * img_n ; - unsigned char *dest = good + j * x * req_comp; + for (j = 0; j < (int)y; ++j) { + unsigned char * src = data + j * x * img_n; + unsigned char * dest = good + j * x * req_comp; - #define STBI__COMBO(a,b) ((a)*8+(b)) - #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) - // convert source image with img_n components to one with req_comp components; - // avoid switch per pixel, so use switch per scanline and massive macros - switch (STBI__COMBO(img_n, req_comp)) { - STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255; } break; - STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; - STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255; } break; - STBI__CASE(2,1) { dest[0]=src[0]; } break; - STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; - STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; - STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255; } break; - STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; - STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255; } break; - STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; - STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break; - STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; - default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion"); - } - #undef STBI__CASE - } +#define STBI__COMBO(a, b) ((a)*8 + (b)) +#define STBI__CASE(a, b) \ + case STBI__COMBO(a, b): \ + for (i = x - 1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1, 2) { + dest[0] = src[0]; + dest[1] = 255; + } + break; + 
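/*
 * stbi__compute_y above is an integer approximation of the Rec. 601 luma
 * weights (0.299, 0.587, 0.114) scaled by 256: 77 + 150 + 29 = 256, so the
 * >> 8 renormalizes without a divide. A quick standalone check of the same
 * arithmetic; luma_u8 is a hypothetical name, not stb_image API.
 */
#include <stdio.h>

// same arithmetic as stbi__compute_y: fixed-point Rec.601 luma, truncating
static unsigned char luma_u8(int r, int g, int b) {
    return (unsigned char)(((r * 77) + (g * 150) + (29 * b)) >> 8);
}

int main(void) {
    printf("%d %d %d %d\n",
           luma_u8(255, 255, 255),  // 255: white stays white (77+150+29 == 256)
           luma_u8(255, 0, 0),      // 76  (0.299 * 255 = 76.2)
           luma_u8(0, 255, 0),      // 149 (0.587 * 255 = 149.7)
           luma_u8(0, 0, 255));     // 28  (0.114 * 255 = 29.1, truncated)
    return 0;
}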
STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } + break; + STBI__CASE(1, 4) { + dest[0] = dest[1] = dest[2] = src[0]; + dest[3] = 255; + } + break; + STBI__CASE(2, 1) { dest[0] = src[0]; } + break; + STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } + break; + STBI__CASE(2, 4) { + dest[0] = dest[1] = dest[2] = src[0]; + dest[3] = src[1]; + } + break; + STBI__CASE(3, 4) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + dest[3] = 255; + } + break; + STBI__CASE(3, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); } + break; + STBI__CASE(3, 2) { + dest[0] = stbi__compute_y(src[0], src[1], src[2]); + dest[1] = 255; + } + break; + STBI__CASE(4, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); } + break; + STBI__CASE(4, 2) { + dest[0] = stbi__compute_y(src[0], src[1], src[2]); + dest[1] = src[3]; + } + break; + STBI__CASE(4, 3) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + } + break; + default: + STBI_ASSERT(0); + STBI_FREE(data); + STBI_FREE(good); + return stbi__errpuc("unsupported", "Unsupported format conversion"); + } +#undef STBI__CASE + } - STBI_FREE(data); - return good; + STBI_FREE(data); + return good; } #endif #if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) // nothing #else -static stbi__uint16 stbi__compute_y_16(int r, int g, int b) -{ - return (stbi__uint16) (((r*77) + (g*150) + (29*b)) >> 8); -} +static stbi__uint16 stbi__compute_y_16(int r, int g, int b) { return (stbi__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8); } #endif #if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) // nothing #else -static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y) -{ - int i,j; - stbi__uint16 *good; +static stbi__uint16 * stbi__convert_format16(stbi__uint16 * data, int img_n, int req_comp, unsigned int x, unsigned int y) { + int i, j; + stbi__uint16 * good; - if (req_comp == img_n) return data; - STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + if (req_comp == img_n) + return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); - good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2); - if (good == NULL) { - STBI_FREE(data); - return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); - } + good = (stbi__uint16 *)stbi__malloc(req_comp * x * y * 2); + if (good == NULL) { + STBI_FREE(data); + return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory"); + } - for (j=0; j < (int) y; ++j) { - stbi__uint16 *src = data + j * x * img_n ; - stbi__uint16 *dest = good + j * x * req_comp; + for (j = 0; j < (int)y; ++j) { + stbi__uint16 * src = data + j * x * img_n; + stbi__uint16 * dest = good + j * x * req_comp; - #define STBI__COMBO(a,b) ((a)*8+(b)) - #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) - // convert source image with img_n components to one with req_comp components; - // avoid switch per pixel, so use switch per scanline and massive macros - switch (STBI__COMBO(img_n, req_comp)) { - STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff; } break; - STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; - STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff; } break; - STBI__CASE(2,1) { dest[0]=src[0]; } break; - STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; - STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; - STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff; } break; - STBI__CASE(3,1) { 
dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; - STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break; - STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; - STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break; - STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; - default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion"); - } - #undef STBI__CASE - } +#define STBI__COMBO(a, b) ((a)*8 + (b)) +#define STBI__CASE(a, b) \ + case STBI__COMBO(a, b): \ + for (i = x - 1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1, 2) { + dest[0] = src[0]; + dest[1] = 0xffff; + } + break; + STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } + break; + STBI__CASE(1, 4) { + dest[0] = dest[1] = dest[2] = src[0]; + dest[3] = 0xffff; + } + break; + STBI__CASE(2, 1) { dest[0] = src[0]; } + break; + STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } + break; + STBI__CASE(2, 4) { + dest[0] = dest[1] = dest[2] = src[0]; + dest[3] = src[1]; + } + break; + STBI__CASE(3, 4) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + dest[3] = 0xffff; + } + break; + STBI__CASE(3, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); } + break; + STBI__CASE(3, 2) { + dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); + dest[1] = 0xffff; + } + break; + STBI__CASE(4, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); } + break; + STBI__CASE(4, 2) { + dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); + dest[1] = src[3]; + } + break; + STBI__CASE(4, 3) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + } + break; + default: + STBI_ASSERT(0); + STBI_FREE(data); + STBI_FREE(good); + return (stbi__uint16 *)stbi__errpuc("unsupported", "Unsupported format conversion"); + } +#undef STBI__CASE + } - STBI_FREE(data); - return good; + STBI_FREE(data); + return good; } #endif #ifndef STBI_NO_LINEAR -static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) -{ - int i,k,n; - float *output; - if (!data) return NULL; - output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0); - if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); } - // compute number of non-alpha components - if (comp & 1) n = comp; else n = comp-1; - for (i=0; i < x*y; ++i) { - for (k=0; k < n; ++k) { - output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale); - } - } - if (n < comp) { - for (i=0; i < x*y; ++i) { - output[i*comp + n] = data[i*comp + n]/255.0f; - } - } - STBI_FREE(data); - return output; +static float * stbi__ldr_to_hdr(stbi_uc * data, int x, int y, int comp) { + int i, k, n; + float * output; + if (!data) + return NULL; + output = (float *)stbi__malloc_mad4(x, y, comp, sizeof(float), 0); + if (output == NULL) { + STBI_FREE(data); + return stbi__errpf("outofmem", "Out of memory"); + } + // compute number of non-alpha components + if (comp & 1) + n = comp; + else + n = comp - 1; + for (i = 0; i < x * y; ++i) { + for (k = 0; k < n; ++k) { + output[i * comp + k] = (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale); + } + } + if (n < comp) { + for (i = 0; i < x * y; ++i) { 
+ output[i * comp + n] = data[i * comp + n] / 255.0f; + } + } + STBI_FREE(data); + return output; } #endif #ifndef STBI_NO_HDR -#define stbi__float2int(x) ((int) (x)) -static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) -{ - int i,k,n; - stbi_uc *output; - if (!data) return NULL; - output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0); - if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } - // compute number of non-alpha components - if (comp & 1) n = comp; else n = comp-1; - for (i=0; i < x*y; ++i) { - for (k=0; k < n; ++k) { - float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; - if (z < 0) z = 0; - if (z > 255) z = 255; - output[i*comp + k] = (stbi_uc) stbi__float2int(z); - } - if (k < comp) { - float z = data[i*comp+k] * 255 + 0.5f; - if (z < 0) z = 0; - if (z > 255) z = 255; - output[i*comp + k] = (stbi_uc) stbi__float2int(z); - } - } - STBI_FREE(data); - return output; +#define stbi__float2int(x) ((int)(x)) +static stbi_uc * stbi__hdr_to_ldr(float * data, int x, int y, int comp) { + int i, k, n; + stbi_uc * output; + if (!data) + return NULL; + output = (stbi_uc *)stbi__malloc_mad3(x, y, comp, 0); + if (output == NULL) { + STBI_FREE(data); + return stbi__errpuc("outofmem", "Out of memory"); + } + // compute number of non-alpha components + if (comp & 1) + n = comp; + else + n = comp - 1; + for (i = 0; i < x * y; ++i) { + for (k = 0; k < n; ++k) { + float z = (float)pow(data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; + if (z < 0) + z = 0; + if (z > 255) + z = 255; + output[i * comp + k] = (stbi_uc)stbi__float2int(z); + } + if (k < comp) { + float z = data[i * comp + k] * 255 + 0.5f; + if (z < 0) + z = 0; + if (z > 255) + z = 255; + output[i * comp + k] = (stbi_uc)stbi__float2int(z); + } + } + STBI_FREE(data); + return output; } #endif @@ -1933,763 +1968,783 @@ static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) #ifndef STBI_NO_JPEG // huffman decoding acceleration -#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache -typedef struct -{ - stbi_uc fast[1 << FAST_BITS]; - // weirdly, repacking this into AoS is a 10% speed loss, instead of a win - stbi__uint16 code[256]; - stbi_uc values[256]; - stbi_uc size[257]; - unsigned int maxcode[18]; - int delta[17]; // old 'firstsymbol' - old 'firstcode' +typedef struct { + stbi_uc fast[1 << FAST_BITS]; + // weirdly, repacking this into AoS is a 10% speed loss, instead of a win + stbi__uint16 code[256]; + stbi_uc values[256]; + stbi_uc size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' } stbi__huffman; -typedef struct -{ - stbi__context *s; - stbi__huffman huff_dc[4]; - stbi__huffman huff_ac[4]; - stbi__uint16 dequant[4][64]; - stbi__int16 fast_ac[4][1 << FAST_BITS]; +typedef struct { + stbi__context * s; + stbi__huffman huff_dc[4]; + stbi__huffman huff_ac[4]; + stbi__uint16 dequant[4][64]; + stbi__int16 fast_ac[4][1 << FAST_BITS]; -// sizes for components, interleaved MCUs - int img_h_max, img_v_max; - int img_mcu_x, img_mcu_y; - int img_mcu_w, img_mcu_h; + // sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; -// definition of jpeg image component - struct - { - int id; - int h,v; - int tq; - int hd,ha; - int dc_pred; + // definition of jpeg image component + struct { + int id; + int h, v; + int 
tq; + int hd, ha; + int dc_pred; - int x,y,w2,h2; - stbi_uc *data; - void *raw_data, *raw_coeff; - stbi_uc *linebuf; - short *coeff; // progressive only - int coeff_w, coeff_h; // number of 8x8 coefficient blocks - } img_comp[4]; + int x, y, w2, h2; + stbi_uc * data; + void *raw_data, *raw_coeff; + stbi_uc * linebuf; + short * coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks + } img_comp[4]; - stbi__uint32 code_buffer; // jpeg entropy-coded buffer - int code_bits; // number of valid bits - unsigned char marker; // marker seen while filling entropy buffer - int nomore; // flag if we saw a marker so must stop + stbi__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop - int progressive; - int spec_start; - int spec_end; - int succ_high; - int succ_low; - int eob_run; - int jfif; - int app14_color_transform; // Adobe APP14 tag - int rgb; + int progressive; + int spec_start; + int spec_end; + int succ_high; + int succ_low; + int eob_run; + int jfif; + int app14_color_transform; // Adobe APP14 tag + int rgb; - int scan_n, order[4]; - int restart_interval, todo; + int scan_n, order[4]; + int restart_interval, todo; -// kernels - void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]); - void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step); - stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs); + // kernels + void (*idct_block_kernel)(stbi_uc * out, int out_stride, short data[64]); + void (*YCbCr_to_RGB_kernel)(stbi_uc * out, const stbi_uc * y, const stbi_uc * pcb, const stbi_uc * pcr, int count, + int step); + stbi_uc * (*resample_row_hv_2_kernel)(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs); } stbi__jpeg; -static int stbi__build_huffman(stbi__huffman *h, int *count) -{ - int i,j,k=0; - unsigned int code; - // build size list for each symbol (from JPEG spec) - for (i=0; i < 16; ++i) { - for (j=0; j < count[i]; ++j) { - h->size[k++] = (stbi_uc) (i+1); - if(k >= 257) return stbi__err("bad size list","Corrupt JPEG"); - } - } - h->size[k] = 0; +static int stbi__build_huffman(stbi__huffman * h, int * count) { + int i, j, k = 0; + unsigned int code; + // build size list for each symbol (from JPEG spec) + for (i = 0; i < 16; ++i) { + for (j = 0; j < count[i]; ++j) { + h->size[k++] = (stbi_uc)(i + 1); + if (k >= 257) + return stbi__err("bad size list", "Corrupt JPEG"); + } + } + h->size[k] = 0; - // compute actual symbols (from jpeg spec) - code = 0; - k = 0; - for(j=1; j <= 16; ++j) { - // compute delta to add to code to compute symbol id - h->delta[j] = k - code; - if (h->size[k] == j) { - while (h->size[k] == j) - h->code[k++] = (stbi__uint16) (code++); - if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG"); - } - // compute largest code + 1 for this size, preshifted as needed later - h->maxcode[j] = code << (16-j); - code <<= 1; - } - h->maxcode[j] = 0xffffffff; + // compute actual symbols (from jpeg spec) + code = 0; + k = 0; + for (j = 1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (stbi__uint16)(code++); + if (code - 1 >= (1u << j)) + return stbi__err("bad code lengths", "Corrupt JPEG"); + } + // compute 
largest code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16 - j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; - // build non-spec acceleration table; 255 is flag for not-accelerated - memset(h->fast, 255, 1 << FAST_BITS); - for (i=0; i < k; ++i) { - int s = h->size[i]; - if (s <= FAST_BITS) { - int c = h->code[i] << (FAST_BITS-s); - int m = 1 << (FAST_BITS-s); - for (j=0; j < m; ++j) { - h->fast[c+j] = (stbi_uc) i; - } - } - } - return 1; + // build non-spec acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i = 0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS - s); + int m = 1 << (FAST_BITS - s); + for (j = 0; j < m; ++j) { + h->fast[c + j] = (stbi_uc)i; + } + } + } + return 1; } // build a table that decodes both magnitude and value of small ACs in // one go. -static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h) -{ - int i; - for (i=0; i < (1 << FAST_BITS); ++i) { - stbi_uc fast = h->fast[i]; - fast_ac[i] = 0; - if (fast < 255) { - int rs = h->values[fast]; - int run = (rs >> 4) & 15; - int magbits = rs & 15; - int len = h->size[fast]; +static void stbi__build_fast_ac(stbi__int16 * fast_ac, stbi__huffman * h) { + int i; + for (i = 0; i < (1 << FAST_BITS); ++i) { + stbi_uc fast = h->fast[i]; + fast_ac[i] = 0; + if (fast < 255) { + int rs = h->values[fast]; + int run = (rs >> 4) & 15; + int magbits = rs & 15; + int len = h->size[fast]; - if (magbits && len + magbits <= FAST_BITS) { - // magnitude code followed by receive_extend code - int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); - int m = 1 << (magbits - 1); - if (k < m) k += (~0U << magbits) + 1; - // if the result is small enough, we can fit it in fast_ac table - if (k >= -128 && k <= 127) - fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits)); - } - } - } + if (magbits && len + magbits <= FAST_BITS) { + // magnitude code followed by receive_extend code + int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); + int m = 1 << (magbits - 1); + if (k < m) + k += (~0U << magbits) + 1; + // if the result is small enough, we can fit it in fast_ac table + if (k >= -128 && k <= 127) + fast_ac[i] = (stbi__int16)((k * 256) + (run * 16) + (len + magbits)); + } + } + } } -static void stbi__grow_buffer_unsafe(stbi__jpeg *j) -{ - do { - unsigned int b = j->nomore ? 0 : stbi__get8(j->s); - if (b == 0xff) { - int c = stbi__get8(j->s); - while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes - if (c != 0) { - j->marker = (unsigned char) c; - j->nomore = 1; - return; - } - } - j->code_buffer |= b << (24 - j->code_bits); - j->code_bits += 8; - } while (j->code_bits <= 24); +static void stbi__grow_buffer_unsafe(stbi__jpeg * j) { + do { + unsigned int b = j->nomore ? 
0 : stbi__get8(j->s); + if (b == 0xff) { + int c = stbi__get8(j->s); + while (c == 0xff) + c = stbi__get8(j->s); // consume fill bytes + if (c != 0) { + j->marker = (unsigned char)c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); } // (1 << n) - 1 -static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; +static const stbi__uint32 stbi__bmask[17] = {0, 1, 3, 7, 15, 31, 63, 127, 255, + 511, 1023, 2047, 4095, 8191, 16383, 32767, 65535}; // decode a jpeg huffman value from the bitstream -stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) -{ - unsigned int temp; - int c,k; +stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg * j, stbi__huffman * h) { + unsigned int temp; + int c, k; - if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); - // look at the top FAST_BITS and determine what symbol ID it is, - // if the code is <= FAST_BITS - c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); - k = h->fast[c]; - if (k < 255) { - int s = h->size[k]; - if (s > j->code_bits) - return -1; - j->code_buffer <<= s; - j->code_bits -= s; - return h->values[k]; - } + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; + } - // naive test is to shift the code_buffer down so k bits are - // valid, then test against maxcode. To speed this up, we've - // preshifted maxcode left so that it has (16-k) 0s at the - // end; in other words, regardless of the number of bits, it - // wants to be compared against something shifted to have 16; - // that way we don't need to shift inside the loop. - temp = j->code_buffer >> 16; - for (k=FAST_BITS+1 ; ; ++k) - if (temp < h->maxcode[k]) - break; - if (k == 17) { - // error! code not found - j->code_bits -= 16; - return -1; - } + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k = FAST_BITS + 1;; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! code not found + j->code_bits -= 16; + return -1; + } - if (k > j->code_bits) - return -1; + if (k > j->code_bits) + return -1; - // convert the huffman code to the symbol id - c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; - if(c < 0 || c >= 256) // symbol id out of bounds! - return -1; - STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]); + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; + if (c < 0 || c >= 256) // symbol id out of bounds! 
+ return -1; + STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]); - // convert the id to a symbol - j->code_bits -= k; - j->code_buffer <<= k; - return h->values[c]; + // convert the id to a symbol + j->code_bits -= k; + j->code_buffer <<= k; + return h->values[c]; } // bias[n] = (-1<<n) + 1 -static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767}; +static const int stbi__jbias[16] = {0, -1, -3, -7, -15, -31, -63, -127, -255, -511, -1023, -2047, -4095, -8191, -16383, -32767}; // combined JPEG 'receive' and JPEG 'extend', since baseline // always extends everything it receives. -stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n) -{ - unsigned int k; - int sgn; - if (j->code_bits < n) stbi__grow_buffer_unsafe(j); - if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing +stbi_inline static int stbi__extend_receive(stbi__jpeg * j, int n) { + unsigned int k; + int sgn; + if (j->code_bits < n) + stbi__grow_buffer_unsafe(j); + if (j->code_bits < n) + return 0; // ran out of bits from stream, return 0s intead of continuing - sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative) - k = stbi_lrot(j->code_buffer, n); - j->code_buffer = k & ~stbi__bmask[n]; - k &= stbi__bmask[n]; - j->code_bits -= n; - return k + (stbi__jbias[n] & (sgn - 1)); + sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative) + k = stbi_lrot(j->code_buffer, n); + j->code_buffer = k & ~stbi__bmask[n]; + k &= stbi__bmask[n]; + j->code_bits -= n; + return k + (stbi__jbias[n] & (sgn - 1)); } // get some unsigned bits -stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n) -{ - unsigned int k; - if (j->code_bits < n) stbi__grow_buffer_unsafe(j); - if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing - k = stbi_lrot(j->code_buffer, n); - j->code_buffer = k & ~stbi__bmask[n]; - k &= stbi__bmask[n]; - j->code_bits -= n; - return k; +stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg * j, int n) { + unsigned int k; + if (j->code_bits < n) + stbi__grow_buffer_unsafe(j); + if (j->code_bits < n) + return 0; // ran out of bits from stream, return 0s intead of continuing + k = stbi_lrot(j->code_buffer, n); + j->code_buffer = k & ~stbi__bmask[n]; + k &= stbi__bmask[n]; + j->code_bits -= n; + return k; } -stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j) -{ - unsigned int k; - if (j->code_bits < 1) stbi__grow_buffer_unsafe(j); - if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing - k = j->code_buffer; - j->code_buffer <<= 1; - --j->code_bits; - return k & 0x80000000; +stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg * j) { + unsigned int k; + if (j->code_bits < 1) + stbi__grow_buffer_unsafe(j); + if (j->code_bits < 1) + return 0; // ran out of bits from stream, return 0s intead of continuing + k = j->code_buffer; + j->code_buffer <<= 1; + --j->code_bits; + return k & 0x80000000; } // given a value that's at position X in the zigzag stream, // where does it appear in the 8x8 matrix coded as row-major?
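/*
 * stbi__extend_receive above is JPEG's combined RECEIVE/EXTEND step done
 * branch-free: the sign comes from the top bit of the code buffer, and
 * stbi__jbias[n] = (-1 << n) + 1 is added only when the value's leading bit is
 * clear (the negative half of the n-bit range). A branchy reference version of
 * EXTEND (JPEG spec F.2.2.1), sketched for clarity only; jpeg_extend is a
 * hypothetical helper, not stb_image API.
 */
#include <assert.h>

static int jpeg_extend(int v, int n) {
    // v holds n freshly read magnitude bits, MSB first
    if (n > 0 && v < (1 << (n - 1)))
        v += (int)((~0u << n) + 1);  // == (-1 << n) + 1, i.e. shift into the negative range
    return v;
}

int main(void) {
    assert(jpeg_extend(2, 3) == -5);  // size-3 code 010 decodes to -5
    assert(jpeg_extend(5, 3) ==  5);  // top bit set: already positive
    return 0;
}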
-static const stbi_uc stbi__jpeg_dezigzag[64+15] = -{ - 0, 1, 8, 16, 9, 2, 3, 10, - 17, 24, 32, 25, 18, 11, 4, 5, - 12, 19, 26, 33, 40, 48, 41, 34, - 27, 20, 13, 6, 7, 14, 21, 28, - 35, 42, 49, 56, 57, 50, 43, 36, - 29, 22, 15, 23, 30, 37, 44, 51, - 58, 59, 52, 45, 38, 31, 39, 46, - 53, 60, 61, 54, 47, 55, 62, 63, - // let corrupt input sample past end - 63, 63, 63, 63, 63, 63, 63, 63, - 63, 63, 63, 63, 63, 63, 63 -}; +static const stbi_uc stbi__jpeg_dezigzag[64 + 15] = { + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, + 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, + // let corrupt input sample past end + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63}; // decode one 64-entry block-- -static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant) -{ - int diff,dc,k; - int t; +static int stbi__jpeg_decode_block(stbi__jpeg * j, short data[64], stbi__huffman * hdc, stbi__huffman * hac, stbi__int16 * fac, + int b, stbi__uint16 * dequant) { + int diff, dc, k; + int t; - if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); - t = stbi__jpeg_huff_decode(j, hdc); - if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG"); + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) + return stbi__err("bad huffman code", "Corrupt JPEG"); - // 0 all the ac values now so we can do it 32-bits at a time - memset(data,0,64*sizeof(data[0])); + // 0 all the ac values now so we can do it 32-bits at a time + memset(data, 0, 64 * sizeof(data[0])); - diff = t ? stbi__extend_receive(j, t) : 0; - if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG"); - dc = j->img_comp[b].dc_pred + diff; - j->img_comp[b].dc_pred = dc; - if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - data[0] = (short) (dc * dequant[0]); + diff = t ? 
stbi__extend_receive(j, t) : 0; + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) + return stbi__err("bad delta", "Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, dequant[0])) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short)(dc * dequant[0]); - // decode AC components, see JPEG spec - k = 1; - do { - unsigned int zig; - int c,r,s; - if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); - c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); - r = fac[c]; - if (r) { // fast-AC path - k += (r >> 4) & 15; // run - s = r & 15; // combined length - if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); - j->code_buffer <<= s; - j->code_bits -= s; - // decode into unzigzag'd location - zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short) ((r >> 8) * dequant[zig]); - } else { - int rs = stbi__jpeg_huff_decode(j, hac); - if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); - s = rs & 15; - r = rs >> 4; - if (s == 0) { - if (rs != 0xf0) break; // end block - k += 16; - } else { - k += r; + // decode AC components, see JPEG spec + k = 1; + do { + unsigned int zig; + int c, r, s; + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) + return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; // decode into unzigzag'd location zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]); - } - } - } while (k < 64); - return 1; + data[zig] = (short)((r >> 8) * dequant[zig]); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) + return stbi__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) + break; // end block + k += 16; + } else { + k += r; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]); + } + } + } while (k < 64); + return 1; } -static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b) -{ - int diff,dc; - int t; - if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); +static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg * j, short data[64], stbi__huffman * hdc, int b) { + int diff, dc; + int t; + if (j->spec_end != 0) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); - if (j->succ_high == 0) { - // first scan for DC coefficient, must be first - memset(data,0,64*sizeof(data[0])); // 0 all the ac values now - t = stbi__jpeg_huff_decode(j, hdc); - if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - diff = t ? stbi__extend_receive(j, t) : 0; + if (j->succ_high == 0) { + // first scan for DC coefficient, must be first + memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + diff = t ? 
stbi__extend_receive(j, t) : 0; - if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG"); - dc = j->img_comp[b].dc_pred + diff; - j->img_comp[b].dc_pred = dc; - if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - data[0] = (short) (dc * (1 << j->succ_low)); - } else { - // refinement scan for DC coefficient - if (stbi__jpeg_get_bit(j)) - data[0] += (short) (1 << j->succ_low); - } - return 1; + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) + return stbi__err("bad delta", "Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short)(dc * (1 << j->succ_low)); + } else { + // refinement scan for DC coefficient + if (stbi__jpeg_get_bit(j)) + data[0] += (short)(1 << j->succ_low); + } + return 1; } // @OPTIMIZE: store non-zigzagged during the decode passes, // and only de-zigzag when dequantizing -static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac) -{ - int k; - if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); +static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg * j, short data[64], stbi__huffman * hac, stbi__int16 * fac) { + int k; + if (j->spec_start == 0) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - if (j->succ_high == 0) { - int shift = j->succ_low; + if (j->succ_high == 0) { + int shift = j->succ_low; - if (j->eob_run) { - --j->eob_run; - return 1; - } + if (j->eob_run) { + --j->eob_run; + return 1; + } - k = j->spec_start; - do { - unsigned int zig; - int c,r,s; - if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); - c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); - r = fac[c]; - if (r) { // fast-AC path - k += (r >> 4) & 15; // run - s = r & 15; // combined length - if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); - j->code_buffer <<= s; - j->code_bits -= s; - zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short) ((r >> 8) * (1 << shift)); - } else { - int rs = stbi__jpeg_huff_decode(j, hac); - if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); - s = rs & 15; - r = rs >> 4; - if (s == 0) { - if (r < 15) { - j->eob_run = (1 << r); - if (r) - j->eob_run += stbi__jpeg_get_bits(j, r); - --j->eob_run; - break; - } - k += 16; + k = j->spec_start; + do { + unsigned int zig; + int c, r, s; + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) + return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short)((r >> 8) * (1 << shift)); } else { - k += r; - zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift)); + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) + return stbi__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r); + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + --j->eob_run; + break; + } + k += 16; + } else { + k += r; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = 
(short)(stbi__extend_receive(j, s) * (1 << shift)); + } } - } - } while (k <= j->spec_end); - } else { - // refinement scan for these AC coefficients + } while (k <= j->spec_end); + } else { + // refinement scan for these AC coefficients - short bit = (short) (1 << j->succ_low); + short bit = (short)(1 << j->succ_low); - if (j->eob_run) { - --j->eob_run; - for (k = j->spec_start; k <= j->spec_end; ++k) { - short *p = &data[stbi__jpeg_dezigzag[k]]; - if (*p != 0) - if (stbi__jpeg_get_bit(j)) - if ((*p & bit)==0) { - if (*p > 0) - *p += bit; - else - *p -= bit; - } - } - } else { - k = j->spec_start; - do { - int r,s; - int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh - if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); - s = rs & 15; - r = rs >> 4; - if (s == 0) { - if (r < 15) { - j->eob_run = (1 << r) - 1; - if (r) - j->eob_run += stbi__jpeg_get_bits(j, r); - r = 64; // force end of block - } else { - // r=15 s=0 should write 16 0s, so we just do - // a run of 15 0s and then write s (which is 0), - // so we don't have to do anything special here - } - } else { - if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG"); - // sign bit - if (stbi__jpeg_get_bit(j)) - s = bit; - else - s = -bit; + if (j->eob_run) { + --j->eob_run; + for (k = j->spec_start; k <= j->spec_end; ++k) { + short * p = &data[stbi__jpeg_dezigzag[k]]; + if (*p != 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit) == 0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } } + } else { + k = j->spec_start; + do { + int r, s; + int rs = stbi__jpeg_huff_decode( + j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) + return stbi__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r) - 1; + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + r = 64; // force end of block + } else { + // r=15 s=0 should write 16 0s, so we just do + // a run of 15 0s and then write s (which is 0), + // so we don't have to do anything special here + } + } else { + if (s != 1) + return stbi__err("bad huffman code", "Corrupt JPEG"); + // sign bit + if (stbi__jpeg_get_bit(j)) + s = bit; + else + s = -bit; + } - // advance by r - while (k <= j->spec_end) { - short *p = &data[stbi__jpeg_dezigzag[k++]]; - if (*p != 0) { - if (stbi__jpeg_get_bit(j)) - if ((*p & bit)==0) { - if (*p > 0) - *p += bit; - else - *p -= bit; - } - } else { - if (r == 0) { - *p = (short) s; - break; - } - --r; - } - } - } while (k <= j->spec_end); - } - } - return 1; + // advance by r + while (k <= j->spec_end) { + short * p = &data[stbi__jpeg_dezigzag[k++]]; + if (*p != 0) { + if (stbi__jpeg_get_bit(j)) + if ((*p & bit) == 0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } else { + if (r == 0) { + *p = (short)s; + break; + } + --r; + } + } + } while (k <= j->spec_end); + } + } + return 1; } // take a -128..127 value and stbi__clamp it and convert to 0..255 -stbi_inline static stbi_uc stbi__clamp(int x) -{ - // trick to use a single test to catch both cases - if ((unsigned int) x > 255) { - if (x < 0) return 0; - if (x > 255) return 255; - } - return (stbi_uc) x; +stbi_inline static stbi_uc stbi__clamp(int x) { + // trick to use a single test to catch both cases + if ((unsigned int)x > 255) { + if (x < 0) + return 0; + if (x > 255) + return 255; + } + return (stbi_uc)x; } -#define stbi__f2f(x) ((int) (((x) * 4096 + 0.5))) -#define stbi__fsh(x) ((x) * 4096) 
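/*
 * stbi__f2f converts the floating-point rotation constants of the
 * jidctint-derived IDCT to 12-bit fixed point (round(x * 4096)), so the 1-D
 * pass below runs in 32-bit integer math; stbi__fsh scales a plain term by the
 * same 1<<12 so both paths share one exponent. A small standalone illustration
 * of the conversion and of a fixed-point multiply; F2F here is a stand-in
 * macro, not the library's.
 */
#include <stdio.h>

#define F2F(x) ((int)((x) * 4096 + 0.5))  // same idea as stbi__f2f: 12 fraction bits

int main(void) {
    int c = F2F(0.5411961f);            // 2217, i.e. 0.5411961 in fixed point
    int x = 100;                        // some DCT coefficient
    int y = (x * c + (1 << 11)) >> 12;  // multiply, then drop the fraction bits with rounding
    printf("c=%d y=%d\n", c, y);        // y == 54 ~= 100 * 0.5411961
    return 0;
}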
+#define stbi__f2f(x) ((int)(((x)*4096 + 0.5))) +#define stbi__fsh(x) ((x)*4096) // derived from jidctint -- DCT_ISLOW -#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ - int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ - p2 = s2; \ - p3 = s6; \ - p1 = (p2+p3) * stbi__f2f(0.5411961f); \ - t2 = p1 + p3*stbi__f2f(-1.847759065f); \ - t3 = p1 + p2*stbi__f2f( 0.765366865f); \ - p2 = s0; \ - p3 = s4; \ - t0 = stbi__fsh(p2+p3); \ - t1 = stbi__fsh(p2-p3); \ - x0 = t0+t3; \ - x3 = t0-t3; \ - x1 = t1+t2; \ - x2 = t1-t2; \ - t0 = s7; \ - t1 = s5; \ - t2 = s3; \ - t3 = s1; \ - p3 = t0+t2; \ - p4 = t1+t3; \ - p1 = t0+t3; \ - p2 = t1+t2; \ - p5 = (p3+p4)*stbi__f2f( 1.175875602f); \ - t0 = t0*stbi__f2f( 0.298631336f); \ - t1 = t1*stbi__f2f( 2.053119869f); \ - t2 = t2*stbi__f2f( 3.072711026f); \ - t3 = t3*stbi__f2f( 1.501321110f); \ - p1 = p5 + p1*stbi__f2f(-0.899976223f); \ - p2 = p5 + p2*stbi__f2f(-2.562915447f); \ - p3 = p3*stbi__f2f(-1.961570560f); \ - p4 = p4*stbi__f2f(-0.390180644f); \ - t3 += p1+p4; \ - t2 += p2+p3; \ - t1 += p2+p4; \ - t0 += p1+p3; +#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7) \ + int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2 + p3) * stbi__f2f(0.5411961f); \ + t2 = p1 + p3 * stbi__f2f(-1.847759065f); \ + t3 = p1 + p2 * stbi__f2f(0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = stbi__fsh(p2 + p3); \ + t1 = stbi__fsh(p2 - p3); \ + x0 = t0 + t3; \ + x3 = t0 - t3; \ + x1 = t1 + t2; \ + x2 = t1 - t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0 + t2; \ + p4 = t1 + t3; \ + p1 = t0 + t3; \ + p2 = t1 + t2; \ + p5 = (p3 + p4) * stbi__f2f(1.175875602f); \ + t0 = t0 * stbi__f2f(0.298631336f); \ + t1 = t1 * stbi__f2f(2.053119869f); \ + t2 = t2 * stbi__f2f(3.072711026f); \ + t3 = t3 * stbi__f2f(1.501321110f); \ + p1 = p5 + p1 * stbi__f2f(-0.899976223f); \ + p2 = p5 + p2 * stbi__f2f(-2.562915447f); \ + p3 = p3 * stbi__f2f(-1.961570560f); \ + p4 = p4 * stbi__f2f(-0.390180644f); \ + t3 += p1 + p4; \ + t2 += p2 + p3; \ + t1 += p2 + p4; \ + t0 += p1 + p3; -static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) -{ - int i,val[64],*v=val; - stbi_uc *o; - short *d = data; +static void stbi__idct_block(stbi_uc * out, int out_stride, short data[64]) { + int i, val[64], *v = val; + stbi_uc * o; + short * d = data; - // columns - for (i=0; i < 8; ++i,++d, ++v) { - // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing - if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0 - && d[40]==0 && d[48]==0 && d[56]==0) { - // no shortcut 0 seconds - // (1|2|3|4|5|6|7)==0 0 seconds - // all separate -0.047 seconds - // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds - int dcterm = d[0]*4; - v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; - } else { - STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]) - // constants scaled things up by 1<<12; let's bring them back - // down, but keep 2 extra bits of precision - x0 += 512; x1 += 512; x2 += 512; x3 += 512; - v[ 0] = (x0+t3) >> 10; - v[56] = (x0-t3) >> 10; - v[ 8] = (x1+t2) >> 10; - v[48] = (x1-t2) >> 10; - v[16] = (x2+t1) >> 10; - v[40] = (x2-t1) >> 10; - v[24] = (x3+t0) >> 10; - v[32] = (x3-t0) >> 10; - } - } + // columns + for (i = 0; i < 8; ++i, ++d, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 
&& 4|5 && 6|7: -0.047 seconds + int dcterm = d[0] * 4; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } else { + STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; + x1 += 512; + x2 += 512; + x3 += 512; + v[0] = (x0 + t3) >> 10; + v[56] = (x0 - t3) >> 10; + v[8] = (x1 + t2) >> 10; + v[48] = (x1 - t2) >> 10; + v[16] = (x2 + t1) >> 10; + v[40] = (x2 - t1) >> 10; + v[24] = (x3 + t0) >> 10; + v[32] = (x3 - t0) >> 10; + } + } - for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { - // no fast case since the first 1D IDCT spread components out - STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]) - // constants scaled things up by 1<<12, plus we had 1<<2 from first - // loop, plus horizontal and vertical each scale by sqrt(8) so together - // we've got an extra 1<<3, so 1<<17 total we need to remove. - // so we want to round that, which means adding 0.5 * 1<<17, - // aka 65536. Also, we'll end up with -128 to 127 that we want - // to encode as 0..255 by adding 128, so we'll add that before the shift - x0 += 65536 + (128<<17); - x1 += 65536 + (128<<17); - x2 += 65536 + (128<<17); - x3 += 65536 + (128<<17); - // tried computing the shifts into temps, or'ing the temps to see - // if any were out of range, but that was slower - o[0] = stbi__clamp((x0+t3) >> 17); - o[7] = stbi__clamp((x0-t3) >> 17); - o[1] = stbi__clamp((x1+t2) >> 17); - o[6] = stbi__clamp((x1-t2) >> 17); - o[2] = stbi__clamp((x2+t1) >> 17); - o[5] = stbi__clamp((x2-t1) >> 17); - o[3] = stbi__clamp((x3+t0) >> 17); - o[4] = stbi__clamp((x3-t0) >> 17); - } + for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) { + // no fast case since the first 1D IDCT spread components out + STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128 << 17); + x1 += 65536 + (128 << 17); + x2 += 65536 + (128 << 17); + x3 += 65536 + (128 << 17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = stbi__clamp((x0 + t3) >> 17); + o[7] = stbi__clamp((x0 - t3) >> 17); + o[1] = stbi__clamp((x1 + t2) >> 17); + o[6] = stbi__clamp((x1 - t2) >> 17); + o[2] = stbi__clamp((x2 + t1) >> 17); + o[5] = stbi__clamp((x2 - t1) >> 17); + o[3] = stbi__clamp((x3 + t0) >> 17); + o[4] = stbi__clamp((x3 - t0) >> 17); + } } #ifdef STBI_SSE2 // sse2 integer IDCT. not the fastest possible implementation but it // produces bit-identical results to the generic C version so it's // fully "transparent". -static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) -{ - // This is constructed to match our regular (generic) integer IDCT exactly. - __m128i row0, row1, row2, row3, row4, row5, row6, row7; - __m128i tmp; +static void stbi__idct_simd(stbi_uc * out, int out_stride, short data[64]) { + // This is constructed to match our regular (generic) integer IDCT exactly. 
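/*
 * The two biases in stbi__idct_block are "add half before shifting" plus a
 * level shift: the column pass keeps 2 extra fraction bits, so it adds 512
 * (0.5 * 2^10) before >> 10; the row pass drops 17 fraction bits and also
 * recentres samples from -128..127 to 0..255, so it adds 65536 (0.5 * 2^17)
 * plus 128 << 17 before >> 17. A standalone arithmetic sketch of the same
 * pattern; fixed_round is a hypothetical helper, not stb_image API.
 */
#include <stdio.h>

// round a fixed-point value with s fraction bits to the nearest integer
static int fixed_round(int v, int s) { return (v + (1 << (s - 1))) >> s; }

int main(void) {
    // column pass: 10 fraction bits, bias 512 == 1 << 9
    printf("%d\n", fixed_round((54 << 10) + 700, 10));  // 54.68 rounds to 55
    // row pass: 17 fraction bits plus the +128 level shift applied to the output
    int sample = -3;                                    // an IDCT result in -128..127
    int biased = sample * (1 << 17) + 65536 + (128 << 17);
    printf("%d\n", biased >> 17);                       // 125 == -3 + 128
    return 0;
}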
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; - // dot product constant: even elems=x, odd elems=y - #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) +// dot product constant: even elems=x, odd elems=y +#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y)) - // out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) - // out(1) = c1[even]*x + c1[odd]*y - #define dct_rot(out0,out1, x,y,c0,c1) \ - __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ - __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ - __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ - __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ - __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ - __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) +// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) +// out(1) = c1[even]*x + c1[odd]*y +#define dct_rot(out0, out1, x, y, c0, c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x), (y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x), (y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) - // out = in << 12 (in 16-bit, out 32-bit) - #define dct_widen(out, in) \ - __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ - __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) +// out = in << 12 (in 16-bit, out 32-bit) +#define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) - // wide add - #define dct_wadd(out, a, b) \ - __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ - __m128i out##_h = _mm_add_epi32(a##_h, b##_h) +// wide add +#define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) - // wide sub - #define dct_wsub(out, a, b) \ - __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ - __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) +// wide sub +#define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) - // butterfly a/b, add bias, then shift by "s" and pack - #define dct_bfly32o(out0, out1, a,b,bias,s) \ - { \ - __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ - __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ - dct_wadd(sum, abiased, b); \ - dct_wsub(dif, abiased, b); \ - out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ - out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ - } +// butterfly a/b, add bias, then shift by "s" and pack +#define dct_bfly32o(out0, out1, a, b, bias, s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } - // 8-bit interleave step (for transposes) - #define dct_interleave8(a, b) \ - tmp = a; \ - a = _mm_unpacklo_epi8(a, b); \ - b = _mm_unpackhi_epi8(tmp, b) +// 8-bit interleave step (for transposes) +#define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) - // 16-bit interleave step (for transposes) - #define 
dct_interleave16(a, b) \ - tmp = a; \ - a = _mm_unpacklo_epi16(a, b); \ - b = _mm_unpackhi_epi16(tmp, b) +// 16-bit interleave step (for transposes) +#define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) - #define dct_pass(bias,shift) \ - { \ - /* even part */ \ - dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ - __m128i sum04 = _mm_add_epi16(row0, row4); \ - __m128i dif04 = _mm_sub_epi16(row0, row4); \ - dct_widen(t0e, sum04); \ - dct_widen(t1e, dif04); \ - dct_wadd(x0, t0e, t3e); \ - dct_wsub(x3, t0e, t3e); \ - dct_wadd(x1, t1e, t2e); \ - dct_wsub(x2, t1e, t2e); \ - /* odd part */ \ - dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ - dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ - __m128i sum17 = _mm_add_epi16(row1, row7); \ - __m128i sum35 = _mm_add_epi16(row3, row5); \ - dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ - dct_wadd(x4, y0o, y4o); \ - dct_wadd(x5, y1o, y5o); \ - dct_wadd(x6, y2o, y5o); \ - dct_wadd(x7, y3o, y4o); \ - dct_bfly32o(row0,row7, x0,x7,bias,shift); \ - dct_bfly32o(row1,row6, x1,x6,bias,shift); \ - dct_bfly32o(row2,row5, x2,x5,bias,shift); \ - dct_bfly32o(row3,row4, x3,x4,bias,shift); \ - } +#define dct_pass(bias, shift) \ + { \ + /* even part */ \ + dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1); \ + dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0, row7, x0, x7, bias, shift); \ + dct_bfly32o(row1, row6, x1, x6, bias, shift); \ + dct_bfly32o(row2, row5, x2, x5, bias, shift); \ + dct_bfly32o(row3, row4, x3, x4, bias, shift); \ + } - __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); - __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f)); - __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); - __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); - __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f)); - __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f)); - __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f)); - __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f)); + __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f), stbi__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f), stbi__f2f(-1.961570560f)); + __m128i rot2_1 = 
dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f)); + __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f), stbi__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f)); - // rounding biases in column/row passes, see stbi__idct_block for explanation. - __m128i bias_0 = _mm_set1_epi32(512); - __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17)); + // rounding biases in column/row passes, see stbi__idct_block for explanation. + __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17)); - // load - row0 = _mm_load_si128((const __m128i *) (data + 0*8)); - row1 = _mm_load_si128((const __m128i *) (data + 1*8)); - row2 = _mm_load_si128((const __m128i *) (data + 2*8)); - row3 = _mm_load_si128((const __m128i *) (data + 3*8)); - row4 = _mm_load_si128((const __m128i *) (data + 4*8)); - row5 = _mm_load_si128((const __m128i *) (data + 5*8)); - row6 = _mm_load_si128((const __m128i *) (data + 6*8)); - row7 = _mm_load_si128((const __m128i *) (data + 7*8)); + // load + row0 = _mm_load_si128((const __m128i *)(data + 0 * 8)); + row1 = _mm_load_si128((const __m128i *)(data + 1 * 8)); + row2 = _mm_load_si128((const __m128i *)(data + 2 * 8)); + row3 = _mm_load_si128((const __m128i *)(data + 3 * 8)); + row4 = _mm_load_si128((const __m128i *)(data + 4 * 8)); + row5 = _mm_load_si128((const __m128i *)(data + 5 * 8)); + row6 = _mm_load_si128((const __m128i *)(data + 6 * 8)); + row7 = _mm_load_si128((const __m128i *)(data + 7 * 8)); - // column pass - dct_pass(bias_0, 10); + // column pass + dct_pass(bias_0, 10); - { - // 16bit 8x8 transpose pass 1 - dct_interleave16(row0, row4); - dct_interleave16(row1, row5); - dct_interleave16(row2, row6); - dct_interleave16(row3, row7); + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); - // transpose pass 2 - dct_interleave16(row0, row2); - dct_interleave16(row1, row3); - dct_interleave16(row4, row6); - dct_interleave16(row5, row7); + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); - // transpose pass 3 - dct_interleave16(row0, row1); - dct_interleave16(row2, row3); - dct_interleave16(row4, row5); - dct_interleave16(row6, row7); - } + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } - // row pass - dct_pass(bias_1, 17); + // row pass + dct_pass(bias_1, 17); - { - // pack - __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 - __m128i p1 = _mm_packus_epi16(row2, row3); - __m128i p2 = _mm_packus_epi16(row4, row5); - __m128i p3 = _mm_packus_epi16(row6, row7); + { + // pack + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); - // 8bit 8x8 transpose pass 1 - dct_interleave8(p0, p2); // a0e0a1e1... - dct_interleave8(p1, p3); // c0g0c1g1... + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... - // transpose pass 2 - dct_interleave8(p0, p1); // a0c0e0g0... - dct_interleave8(p2, p3); // b0d0f0h0... + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... 
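/*
 * The dct_interleave8/dct_interleave16 passes above are a standard SIMD 8x8
 * transpose: each unpacklo/unpackhi pair weaves two rows together, and after
 * log2(8) = 3 passes every register holds one column of the original block.
 * For reference, a plain scalar version of the layout change the intrinsics
 * produce; transpose8x8 is an illustrative helper, not part of stb_image.
 */
static void transpose8x8(unsigned char in[8][8], unsigned char out[8][8]) {
    for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c)
            out[c][r] = in[r][c];  // column c of the input becomes row c of the output
}

int main(void) {
    unsigned char in[8][8], out[8][8];
    for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c)
            in[r][c] = (unsigned char)(r * 8 + c);
    transpose8x8(in, out);
    return out[3][5] == in[5][3] ? 0 : 1;  // sanity check
}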
+ dct_interleave8(p2, p3); // b0d0f0h0... - // transpose pass 3 - dct_interleave8(p0, p2); // a0b0c0d0... - dct_interleave8(p1, p3); // a4b4c4d4... + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... - // store - _mm_storel_epi64((__m128i *) out, p0); out += out_stride; - _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; - _mm_storel_epi64((__m128i *) out, p2); out += out_stride; - _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; - _mm_storel_epi64((__m128i *) out, p1); out += out_stride; - _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; - _mm_storel_epi64((__m128i *) out, p3); out += out_stride; - _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e)); - } + // store + _mm_storel_epi64((__m128i *)out, p0); + out += out_stride; + _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e)); + out += out_stride; + _mm_storel_epi64((__m128i *)out, p2); + out += out_stride; + _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e)); + out += out_stride; + _mm_storel_epi64((__m128i *)out, p1); + out += out_stride; + _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e)); + out += out_stride; + _mm_storel_epi64((__m128i *)out, p3); + out += out_stride; + _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e)); + } #undef dct_const #undef dct_rot @@ -2708,198 +2763,235 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) // NEON integer IDCT. should produce bit-identical // results to the generic C version. -static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) -{ - int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; +static void stbi__idct_simd(stbi_uc * out, int out_stride, short data[64]) { + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; - int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); - int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); - int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f)); - int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f)); - int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); - int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); - int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); - int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); - int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f)); - int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f)); - int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f)); - int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f)); + int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f)); -#define dct_long_mul(out, inq, coeff) \ - int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ - int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = 
vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) -#define dct_long_mac(out, acc, inq, coeff) \ - int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \ - int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) -#define dct_widen(out, inq) \ - int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ - int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) // wide add -#define dct_wadd(out, a, b) \ - int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ - int32x4_t out##_h = vaddq_s32(a##_h, b##_h) +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) // wide sub -#define dct_wsub(out, a, b) \ - int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ - int32x4_t out##_h = vsubq_s32(a##_h, b##_h) +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) // butterfly a/b, then shift using "shiftop" by "s" and pack -#define dct_bfly32o(out0,out1, a,b,shiftop,s) \ - { \ - dct_wadd(sum, a, b); \ - dct_wsub(dif, a, b); \ - out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ - out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ - } +#define dct_bfly32o(out0, out1, a, b, shiftop, s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } -#define dct_pass(shiftop, shift) \ - { \ - /* even part */ \ - int16x8_t sum26 = vaddq_s16(row2, row6); \ - dct_long_mul(p1e, sum26, rot0_0); \ - dct_long_mac(t2e, p1e, row6, rot0_1); \ - dct_long_mac(t3e, p1e, row2, rot0_2); \ - int16x8_t sum04 = vaddq_s16(row0, row4); \ - int16x8_t dif04 = vsubq_s16(row0, row4); \ - dct_widen(t0e, sum04); \ - dct_widen(t1e, dif04); \ - dct_wadd(x0, t0e, t3e); \ - dct_wsub(x3, t0e, t3e); \ - dct_wadd(x1, t1e, t2e); \ - dct_wsub(x2, t1e, t2e); \ - /* odd part */ \ - int16x8_t sum15 = vaddq_s16(row1, row5); \ - int16x8_t sum17 = vaddq_s16(row1, row7); \ - int16x8_t sum35 = vaddq_s16(row3, row5); \ - int16x8_t sum37 = vaddq_s16(row3, row7); \ - int16x8_t sumodd = vaddq_s16(sum17, sum35); \ - dct_long_mul(p5o, sumodd, rot1_0); \ - dct_long_mac(p1o, p5o, sum17, rot1_1); \ - dct_long_mac(p2o, p5o, sum35, rot1_2); \ - dct_long_mul(p3o, sum37, rot2_0); \ - dct_long_mul(p4o, sum15, rot2_1); \ - dct_wadd(sump13o, p1o, p3o); \ - dct_wadd(sump24o, p2o, p4o); \ - dct_wadd(sump23o, p2o, p3o); \ - dct_wadd(sump14o, p1o, p4o); \ - dct_long_mac(x4, sump13o, row7, rot3_0); \ - dct_long_mac(x5, sump24o, row5, rot3_1); \ - dct_long_mac(x6, sump23o, row3, rot3_2); \ - dct_long_mac(x7, sump14o, row1, rot3_3); \ - dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ - dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ - dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ - dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ - } +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + 
int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + dct_long_mac(x6, sump23o, row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0, row7, x0, x7, shiftop, shift); \ + dct_bfly32o(row1, row6, x1, x6, shiftop, shift); \ + dct_bfly32o(row2, row5, x2, x5, shiftop, shift); \ + dct_bfly32o(row3, row4, x3, x4, shiftop, shift); \ + } - // load - row0 = vld1q_s16(data + 0*8); - row1 = vld1q_s16(data + 1*8); - row2 = vld1q_s16(data + 2*8); - row3 = vld1q_s16(data + 3*8); - row4 = vld1q_s16(data + 4*8); - row5 = vld1q_s16(data + 5*8); - row6 = vld1q_s16(data + 6*8); - row7 = vld1q_s16(data + 7*8); + // load + row0 = vld1q_s16(data + 0 * 8); + row1 = vld1q_s16(data + 1 * 8); + row2 = vld1q_s16(data + 2 * 8); + row3 = vld1q_s16(data + 3 * 8); + row4 = vld1q_s16(data + 4 * 8); + row5 = vld1q_s16(data + 5 * 8); + row6 = vld1q_s16(data + 6 * 8); + row7 = vld1q_s16(data + 7 * 8); - // add DC bias - row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); - // column pass - dct_pass(vrshrn_n_s32, 10); + // column pass + dct_pass(vrshrn_n_s32, 10); - // 16bit 8x8 transpose - { + // 16bit 8x8 transpose + { // these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. // whether compilers actually get this is another story, sadly. 
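// Rough scalar model of the 16-bit 8x8 transpose performed by the three dct_trn
// passes below (a sketch for intuition only; stbi__transpose8x8_model is an
// illustrative name and is not part of this file, so it is kept out of the
// build): pass s swaps the two off-diagonal s x s sub-tiles of every 2s x 2s
// tile, for s = 1 (dct_trn16), s = 2 (dct_trn32) and s = 4 (dct_trn64), and
// composing the three swaps yields the full transpose.
#if 0
static void stbi__transpose8x8_model(short m[8][8]) {
    int s, i, j, di, dj;
    for (s = 1; s < 8; s *= 2) {  // tile size: 1, 2, 4
        for (i = 0; i < 8; i += 2 * s) {
            for (j = 0; j < 8; j += 2 * s) {
                for (di = 0; di < s; ++di) {
                    for (dj = 0; dj < s; ++dj) {
                        short t               = m[i + di][j + s + dj];
                        m[i + di][j + s + dj] = m[i + s + di][j + dj];
                        m[i + s + di][j + dj] = t;
                    }
                }
            }
        }
    }
}
#endif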
-#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } -#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } -#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } +#define dct_trn16(x, y) \ + { \ + int16x8x2_t t = vtrnq_s16(x, y); \ + x = t.val[0]; \ + y = t.val[1]; \ + } +#define dct_trn32(x, y) \ + { \ + int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); \ + x = vreinterpretq_s16_s32(t.val[0]); \ + y = vreinterpretq_s16_s32(t.val[1]); \ + } +#define dct_trn64(x, y) \ + { \ + int16x8_t x0 = x; \ + int16x8_t y0 = y; \ + x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); \ + y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); \ + } - // pass 1 - dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 - dct_trn16(row2, row3); - dct_trn16(row4, row5); - dct_trn16(row6, row7); + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); - // pass 2 - dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 - dct_trn32(row1, row3); - dct_trn32(row4, row6); - dct_trn32(row5, row7); + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); - // pass 3 - dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 - dct_trn64(row1, row5); - dct_trn64(row2, row6); - dct_trn64(row3, row7); + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); #undef dct_trn16 #undef dct_trn32 #undef dct_trn64 - } + } - // row pass - // vrshrn_n_s32 only supports shifts up to 16, we need - // 17. so do a non-rounding shift of 16 first then follow - // up with a rounding shift by 1. - dct_pass(vshrn_n_s32, 16); + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. + dct_pass(vshrn_n_s32, 16); - { - // pack and round - uint8x8_t p0 = vqrshrun_n_s16(row0, 1); - uint8x8_t p1 = vqrshrun_n_s16(row1, 1); - uint8x8_t p2 = vqrshrun_n_s16(row2, 1); - uint8x8_t p3 = vqrshrun_n_s16(row3, 1); - uint8x8_t p4 = vqrshrun_n_s16(row4, 1); - uint8x8_t p5 = vqrshrun_n_s16(row5, 1); - uint8x8_t p6 = vqrshrun_n_s16(row6, 1); - uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + { + // pack and round + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); - // again, these can translate into one instruction, but often don't. -#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } -#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } -#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } + // again, these can translate into one instruction, but often don't. 
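// Note on the 16 + 1 split used for the row pass above (a sketch in scalar
// terms): writing x = (q << 16) + r with q = x >> 16 and 0 <= r < 65536, a true
// rounding shift by 17 is
//     (x + (1 << 16)) >> 17 == (q + 1) >> 1
// because adding r < 2^16 can never push (q + 1) << 16 across the next multiple
// of 2^17. That (q + 1) >> 1 is exactly what vqrshrun_n_s16(..., 1) computes in
// the "pack and round" step, and it saturates the result to 0..255 at the same
// time.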
+#define dct_trn8_8(x, y) \ + { \ + uint8x8x2_t t = vtrn_u8(x, y); \ + x = t.val[0]; \ + y = t.val[1]; \ + } +#define dct_trn8_16(x, y) \ + { \ + uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); \ + x = vreinterpret_u8_u16(t.val[0]); \ + y = vreinterpret_u8_u16(t.val[1]); \ + } +#define dct_trn8_32(x, y) \ + { \ + uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); \ + x = vreinterpret_u8_u32(t.val[0]); \ + y = vreinterpret_u8_u32(t.val[1]); \ + } - // sadly can't use interleaved stores here since we only write - // 8 bytes to each scan line! + // sadly can't use interleaved stores here since we only write + // 8 bytes to each scan line! - // 8x8 8-bit transpose pass 1 - dct_trn8_8(p0, p1); - dct_trn8_8(p2, p3); - dct_trn8_8(p4, p5); - dct_trn8_8(p6, p7); + // 8x8 8-bit transpose pass 1 + dct_trn8_8(p0, p1); + dct_trn8_8(p2, p3); + dct_trn8_8(p4, p5); + dct_trn8_8(p6, p7); - // pass 2 - dct_trn8_16(p0, p2); - dct_trn8_16(p1, p3); - dct_trn8_16(p4, p6); - dct_trn8_16(p5, p7); + // pass 2 + dct_trn8_16(p0, p2); + dct_trn8_16(p1, p3); + dct_trn8_16(p4, p6); + dct_trn8_16(p5, p7); - // pass 3 - dct_trn8_32(p0, p4); - dct_trn8_32(p1, p5); - dct_trn8_32(p2, p6); - dct_trn8_32(p3, p7); + // pass 3 + dct_trn8_32(p0, p4); + dct_trn8_32(p1, p5); + dct_trn8_32(p2, p6); + dct_trn8_32(p3, p7); - // store - vst1_u8(out, p0); out += out_stride; - vst1_u8(out, p1); out += out_stride; - vst1_u8(out, p2); out += out_stride; - vst1_u8(out, p3); out += out_stride; - vst1_u8(out, p4); out += out_stride; - vst1_u8(out, p5); out += out_stride; - vst1_u8(out, p6); out += out_stride; - vst1_u8(out, p7); + // store + vst1_u8(out, p0); + out += out_stride; + vst1_u8(out, p1); + out += out_stride; + vst1_u8(out, p2); + out += out_stride; + vst1_u8(out, p3); + out += out_stride; + vst1_u8(out, p4); + out += out_stride; + vst1_u8(out, p5); + out += out_stride; + vst1_u8(out, p6); + out += out_stride; + vst1_u8(out, p7); #undef dct_trn8_8 #undef dct_trn8_16 #undef dct_trn8_32 - } + } #undef dct_long_mul #undef dct_long_mac @@ -2912,1169 +3004,1267 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) #endif // STBI_NEON -#define STBI__MARKER_none 0xff +#define STBI__MARKER_none 0xff // if there's a pending marker from the entropy stream, return that // otherwise, fetch from the stream and get a marker. 
if there's no // marker, return 0xff, which is never a valid marker value -static stbi_uc stbi__get_marker(stbi__jpeg *j) -{ - stbi_uc x; - if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } - x = stbi__get8(j->s); - if (x != 0xff) return STBI__MARKER_none; - while (x == 0xff) - x = stbi__get8(j->s); // consume repeated 0xff fill bytes - return x; +static stbi_uc stbi__get_marker(stbi__jpeg * j) { + stbi_uc x; + if (j->marker != STBI__MARKER_none) { + x = j->marker; + j->marker = STBI__MARKER_none; + return x; + } + x = stbi__get8(j->s); + if (x != 0xff) + return STBI__MARKER_none; + while (x == 0xff) + x = stbi__get8(j->s); // consume repeated 0xff fill bytes + return x; } // in each scan, we'll have scan_n components, and the order // of the components is specified by order[] -#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) +#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) // after a restart interval, stbi__jpeg_reset the entropy decoder and // the dc prediction -static void stbi__jpeg_reset(stbi__jpeg *j) -{ - j->code_bits = 0; - j->code_buffer = 0; - j->nomore = 0; - j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0; - j->marker = STBI__MARKER_none; - j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; - j->eob_run = 0; - // no more than 1<<31 MCUs if no restart_interal? that's plenty safe, - // since we don't even allow 1<<30 pixels +static void stbi__jpeg_reset(stbi__jpeg * j) { + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0; + j->marker = STBI__MARKER_none; + j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; + j->eob_run = 0; + // no more than 1<<31 MCUs if no restart_interal? 
that's plenty safe, + // since we don't even allow 1<<30 pixels } -static int stbi__parse_entropy_coded_data(stbi__jpeg *z) -{ - stbi__jpeg_reset(z); - if (!z->progressive) { - if (z->scan_n == 1) { - int i,j; - STBI_SIMD_ALIGN(short, data[64]); - int n = z->order[0]; - // non-interleaved data, we just need to process one block at a time, - // in trivial scanline order - // number of blocks to do just depends on how many actual "pixels" this - // component has, independent of interleaved MCU blocking and such - int w = (z->img_comp[n].x+7) >> 3; - int h = (z->img_comp[n].y+7) >> 3; - for (j=0; j < h; ++j) { - for (i=0; i < w; ++i) { - int ha = z->img_comp[n].ha; - if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; - z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); - // every data block is an MCU, so countdown the restart interval - if (--z->todo <= 0) { - if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); - // if it's NOT a restart, then just bail, so we get corrupt data - // rather than no data - if (!STBI__RESTART(z->marker)) return 1; - stbi__jpeg_reset(z); - } +static int stbi__parse_entropy_coded_data(stbi__jpeg * z) { + stbi__jpeg_reset(z); + if (!z->progressive) { + if (z->scan_n == 1) { + int i, j; + STBI_SIMD_ALIGN(short, data[64]); + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, + z->dequant[z->img_comp[n].tq])) + return 0; + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data); + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) + stbi__grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!STBI__RESTART(z->marker)) + return 1; + stbi__jpeg_reset(z); + } + } } - } - return 1; - } else { // interleaved - int i,j,k,x,y; - STBI_SIMD_ALIGN(short, data[64]); - for (j=0; j < z->img_mcu_y; ++j) { - for (i=0; i < z->img_mcu_x; ++i) { - // scan an interleaved mcu... 
process scan_n components in order - for (k=0; k < z->scan_n; ++k) { - int n = z->order[k]; - // scan out an mcu's worth of this component; that's just determined - // by the basic H and V specified for the component - for (y=0; y < z->img_comp[n].v; ++y) { - for (x=0; x < z->img_comp[n].h; ++x) { - int x2 = (i*z->img_comp[n].h + x)*8; - int y2 = (j*z->img_comp[n].v + y)*8; - int ha = z->img_comp[n].ha; - if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; - z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data); - } - } - } - // after all interleaved components, that's an interleaved MCU, - // so now count down the restart interval - if (--z->todo <= 0) { - if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); - if (!STBI__RESTART(z->marker)) return 1; - stbi__jpeg_reset(z); - } + return 1; + } else { // interleaved + int i, j, k, x, y; + STBI_SIMD_ALIGN(short, data[64]); + for (j = 0; j < z->img_mcu_y; ++j) { + for (i = 0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... process scan_n components in order + for (k = 0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y = 0; y < z->img_comp[n].v; ++y) { + for (x = 0; x < z->img_comp[n].h; ++x) { + int x2 = (i * z->img_comp[n].h + x) * 8; + int y2 = (j * z->img_comp[n].v + y) * 8; + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, + z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) + return 0; + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2, + data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) + stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) + return 1; + stbi__jpeg_reset(z); + } + } } - } - return 1; - } - } else { - if (z->scan_n == 1) { - int i,j; - int n = z->order[0]; - // non-interleaved data, we just need to process one block at a time, - // in trivial scanline order - // number of blocks to do just depends on how many actual "pixels" this - // component has, independent of interleaved MCU blocking and such - int w = (z->img_comp[n].x+7) >> 3; - int h = (z->img_comp[n].y+7) >> 3; - for (j=0; j < h; ++j) { - for (i=0; i < w; ++i) { - short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); - if (z->spec_start == 0) { - if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) - return 0; - } else { - int ha = z->img_comp[n].ha; - if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) - return 0; - } - // every data block is an MCU, so countdown the restart interval - if (--z->todo <= 0) { - if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); - if (!STBI__RESTART(z->marker)) return 1; - stbi__jpeg_reset(z); - } - } - } - return 1; - } else { // interleaved - int i,j,k,x,y; - for (j=0; j < z->img_mcu_y; ++j) { - for (i=0; i < z->img_mcu_x; ++i) { - // scan an interleaved mcu... 
process scan_n components in order - for (k=0; k < z->scan_n; ++k) { - int n = z->order[k]; - // scan out an mcu's worth of this component; that's just determined - // by the basic H and V specified for the component - for (y=0; y < z->img_comp[n].v; ++y) { - for (x=0; x < z->img_comp[n].h; ++x) { - int x2 = (i*z->img_comp[n].h + x); - int y2 = (j*z->img_comp[n].v + y); - short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + return 1; + } + } else { + if (z->scan_n == 1) { + int i, j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + short * data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) - return 0; - } - } - } - // after all interleaved components, that's an interleaved MCU, - // so now count down the restart interval - if (--z->todo <= 0) { - if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); - if (!STBI__RESTART(z->marker)) return 1; - stbi__jpeg_reset(z); - } + return 0; + } else { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) + stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) + return 1; + stbi__jpeg_reset(z); + } + } } - } - return 1; - } - } -} - -static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant) -{ - int i; - for (i=0; i < 64; ++i) - data[i] *= dequant[i]; -} - -static void stbi__jpeg_finish(stbi__jpeg *z) -{ - if (z->progressive) { - // dequantize and idct the data - int i,j,n; - for (n=0; n < z->s->img_n; ++n) { - int w = (z->img_comp[n].x+7) >> 3; - int h = (z->img_comp[n].y+7) >> 3; - for (j=0; j < h; ++j) { - for (i=0; i < w; ++i) { - short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); - stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); - z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + return 1; + } else { // interleaved + int i, j, k, x, y; + for (j = 0; j < z->img_mcu_y; ++j) { + for (i = 0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k = 0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y = 0; y < z->img_comp[n].v; ++y) { + for (x = 0; x < z->img_comp[n].h; ++x) { + int x2 = (i * z->img_comp[n].h + x); + int y2 = (j * z->img_comp[n].v + y); + short * data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) + stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) + return 1; + stbi__jpeg_reset(z); + } + } } - } - } - } + return 1; + } + } } -static int stbi__process_marker(stbi__jpeg *z, int m) -{ - int L; - switch (m) { - case STBI__MARKER_none: // no marker found - return stbi__err("expected marker","Corrupt JPEG"); +static void stbi__jpeg_dequantize(short * data, stbi__uint16 * dequant) { + int i; + for (i = 0; i < 64; ++i) + data[i] *= dequant[i]; +} - case 0xDD: // DRI - specify restart interval - if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG"); - z->restart_interval = stbi__get16be(z->s); - return 1; +static void stbi__jpeg_finish(stbi__jpeg * z) { + if (z->progressive) { + // dequantize and idct the data + int i, j, n; + for (n = 0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + short * data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data); + } + } + } + } +} - case 0xDB: // DQT - define quantization table - L = stbi__get16be(z->s)-2; - while (L > 0) { +static int stbi__process_marker(stbi__jpeg * z, int m) { + int L; + switch (m) { + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker", "Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) + return stbi__err("bad DRI len", "Corrupt JPEG"); + z->restart_interval = stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s) - 2; + while (L > 0) { int q = stbi__get8(z->s); int p = q >> 4, sixteen = (p != 0); - int t = q & 15,i; - if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG"); - if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG"); + int t = q & 15, i; + if (p != 0 && p != 1) + return stbi__err("bad DQT type", "Corrupt JPEG"); + if (t > 3) + return stbi__err("bad DQT table", "Corrupt JPEG"); - for (i=0; i < 64; ++i) - z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); + for (i = 0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); L -= (sixteen ? 
129 : 65); - } - return L==0; + } + return L == 0; - case 0xC4: // DHT - define huffman table - L = stbi__get16be(z->s)-2; - while (L > 0) { - stbi_uc *v; - int sizes[16],i,n=0; + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s) - 2; + while (L > 0) { + stbi_uc * v; + int sizes[16], i, n = 0; int q = stbi__get8(z->s); int tc = q >> 4; int th = q & 15; - if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG"); - for (i=0; i < 16; ++i) { - sizes[i] = stbi__get8(z->s); - n += sizes[i]; + if (tc > 1 || th > 3) + return stbi__err("bad DHT header", "Corrupt JPEG"); + for (i = 0; i < 16; ++i) { + sizes[i] = stbi__get8(z->s); + n += sizes[i]; } - if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values! + if (n > 256) + return stbi__err("bad DHT header", "Corrupt JPEG"); // Loop over i < n would write past end of values! L -= 17; if (tc == 0) { - if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0; - v = z->huff_dc[th].values; + if (!stbi__build_huffman(z->huff_dc + th, sizes)) + return 0; + v = z->huff_dc[th].values; } else { - if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0; - v = z->huff_ac[th].values; + if (!stbi__build_huffman(z->huff_ac + th, sizes)) + return 0; + v = z->huff_ac[th].values; } - for (i=0; i < n; ++i) - v[i] = stbi__get8(z->s); + for (i = 0; i < n; ++i) + v[i] = stbi__get8(z->s); if (tc != 0) - stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); L -= n; - } - return L==0; - } + } + return L == 0; + } - // check for comment block or APP blocks - if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { - L = stbi__get16be(z->s); - if (L < 2) { - if (m == 0xFE) - return stbi__err("bad COM len","Corrupt JPEG"); - else - return stbi__err("bad APP len","Corrupt JPEG"); - } - L -= 2; + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + L = stbi__get16be(z->s); + if (L < 2) { + if (m == 0xFE) + return stbi__err("bad COM len", "Corrupt JPEG"); + else + return stbi__err("bad APP len", "Corrupt JPEG"); + } + L -= 2; - if (m == 0xE0 && L >= 5) { // JFIF APP0 segment - static const unsigned char tag[5] = {'J','F','I','F','\0'}; - int ok = 1; - int i; - for (i=0; i < 5; ++i) - if (stbi__get8(z->s) != tag[i]) - ok = 0; - L -= 5; - if (ok) - z->jfif = 1; - } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment - static const unsigned char tag[6] = {'A','d','o','b','e','\0'}; - int ok = 1; - int i; - for (i=0; i < 6; ++i) - if (stbi__get8(z->s) != tag[i]) - ok = 0; - L -= 6; - if (ok) { - stbi__get8(z->s); // version - stbi__get16be(z->s); // flags0 - stbi__get16be(z->s); // flags1 - z->app14_color_transform = stbi__get8(z->s); // color transform + if (m == 0xE0 && L >= 5) { // JFIF APP0 segment + static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'}; + int ok = 1; + int i; + for (i = 0; i < 5; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 5; + if (ok) + z->jfif = 1; + } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment + static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'}; + int ok = 1; + int i; + for (i = 0; i < 6; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; L -= 6; - } - } + if (ok) { + stbi__get8(z->s); // version + stbi__get16be(z->s); // flags0 + stbi__get16be(z->s); // flags1 + z->app14_color_transform = stbi__get8(z->s); // color transform + L -= 6; + } + } - stbi__skip(z->s, L); - return 1; - } + stbi__skip(z->s, L); + return 1; + } - return 
stbi__err("unknown marker","Corrupt JPEG"); + return stbi__err("unknown marker", "Corrupt JPEG"); } // after we see SOS -static int stbi__process_scan_header(stbi__jpeg *z) -{ - int i; - int Ls = stbi__get16be(z->s); - z->scan_n = stbi__get8(z->s); - if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG"); - if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG"); - for (i=0; i < z->scan_n; ++i) { - int id = stbi__get8(z->s), which; - int q = stbi__get8(z->s); - for (which = 0; which < z->s->img_n; ++which) - if (z->img_comp[which].id == id) - break; - if (which == z->s->img_n) return 0; // no match - z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG"); - z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG"); - z->order[i] = which; - } +static int stbi__process_scan_header(stbi__jpeg * z) { + int i; + int Ls = stbi__get16be(z->s); + z->scan_n = stbi__get8(z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n) + return stbi__err("bad SOS component count", "Corrupt JPEG"); + if (Ls != 6 + 2 * z->scan_n) + return stbi__err("bad SOS len", "Corrupt JPEG"); + for (i = 0; i < z->scan_n; ++i) { + int id = stbi__get8(z->s), which; + int q = stbi__get8(z->s); + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) + return 0; // no match + z->img_comp[which].hd = q >> 4; + if (z->img_comp[which].hd > 3) + return stbi__err("bad DC huff", "Corrupt JPEG"); + z->img_comp[which].ha = q & 15; + if (z->img_comp[which].ha > 3) + return stbi__err("bad AC huff", "Corrupt JPEG"); + z->order[i] = which; + } - { - int aa; - z->spec_start = stbi__get8(z->s); - z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 - aa = stbi__get8(z->s); - z->succ_high = (aa >> 4); - z->succ_low = (aa & 15); - if (z->progressive) { - if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) - return stbi__err("bad SOS", "Corrupt JPEG"); - } else { - if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG"); - if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG"); - z->spec_end = 63; - } - } + { + int aa; + z->spec_start = stbi__get8(z->s); + z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 + aa = stbi__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return stbi__err("bad SOS", "Corrupt JPEG"); + } else { + if (z->spec_start != 0) + return stbi__err("bad SOS", "Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) + return stbi__err("bad SOS", "Corrupt JPEG"); + z->spec_end = 63; + } + } - return 1; + return 1; } -static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) -{ - int i; - for (i=0; i < ncomp; ++i) { - if (z->img_comp[i].raw_data) { - STBI_FREE(z->img_comp[i].raw_data); - z->img_comp[i].raw_data = NULL; - z->img_comp[i].data = NULL; - } - if (z->img_comp[i].raw_coeff) { - STBI_FREE(z->img_comp[i].raw_coeff); - z->img_comp[i].raw_coeff = 0; - z->img_comp[i].coeff = 0; - } - if (z->img_comp[i].linebuf) { - STBI_FREE(z->img_comp[i].linebuf); - z->img_comp[i].linebuf = NULL; - } - } - return why; +static int 
stbi__free_jpeg_components(stbi__jpeg * z, int ncomp, int why) { + int i; + for (i = 0; i < ncomp; ++i) { + if (z->img_comp[i].raw_data) { + STBI_FREE(z->img_comp[i].raw_data); + z->img_comp[i].raw_data = NULL; + z->img_comp[i].data = NULL; + } + if (z->img_comp[i].raw_coeff) { + STBI_FREE(z->img_comp[i].raw_coeff); + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].coeff = 0; + } + if (z->img_comp[i].linebuf) { + STBI_FREE(z->img_comp[i].linebuf); + z->img_comp[i].linebuf = NULL; + } + } + return why; } -static int stbi__process_frame_header(stbi__jpeg *z, int scan) -{ - stbi__context *s = z->s; - int Lf,p,i,q, h_max=1,v_max=1,c; - Lf = stbi__get16be(s); if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG - p = stbi__get8(s); if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline - s->img_y = stbi__get16be(s); if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG - s->img_x = stbi__get16be(s); if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires - if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); - if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); - c = stbi__get8(s); - if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG"); - s->img_n = c; - for (i=0; i < c; ++i) { - z->img_comp[i].data = NULL; - z->img_comp[i].linebuf = NULL; - } +static int stbi__process_frame_header(stbi__jpeg * z, int scan) { + stbi__context * s = z->s; + int Lf, p, i, q, h_max = 1, v_max = 1, c; + Lf = stbi__get16be(s); + if (Lf < 11) + return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG + p = stbi__get8(s); + if (p != 8) + return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = stbi__get16be(s); + if (s->img_y == 0) + return stbi__err("no header height", + "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = stbi__get16be(s); + if (s->img_x == 0) + return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires + if (s->img_y > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + c = stbi__get8(s); + if (c != 3 && c != 1 && c != 4) + return stbi__err("bad component count", "Corrupt JPEG"); + s->img_n = c; + for (i = 0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } - if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG"); + if (Lf != 8 + 3 * s->img_n) + return stbi__err("bad SOF len", "Corrupt JPEG"); - z->rgb = 0; - for (i=0; i < s->img_n; ++i) { - static const unsigned char rgb[3] = { 'R', 'G', 'B' }; - z->img_comp[i].id = stbi__get8(s); - if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) - ++z->rgb; - q = stbi__get8(s); - z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG"); - z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG"); - z->img_comp[i].tq = stbi__get8(s); if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG"); - } + z->rgb = 0; + for (i = 0; i < s->img_n; ++i) { + static const unsigned char rgb[3] = {'R', 'G', 'B'}; + z->img_comp[i].id = stbi__get8(s); + if 
(s->img_n == 3 && z->img_comp[i].id == rgb[i]) + ++z->rgb; + q = stbi__get8(s); + z->img_comp[i].h = (q >> 4); + if (!z->img_comp[i].h || z->img_comp[i].h > 4) + return stbi__err("bad H", "Corrupt JPEG"); + z->img_comp[i].v = q & 15; + if (!z->img_comp[i].v || z->img_comp[i].v > 4) + return stbi__err("bad V", "Corrupt JPEG"); + z->img_comp[i].tq = stbi__get8(s); + if (z->img_comp[i].tq > 3) + return stbi__err("bad TQ", "Corrupt JPEG"); + } - if (scan != STBI__SCAN_load) return 1; + if (scan != STBI__SCAN_load) + return 1; - if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode"); + if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) + return stbi__err("too large", "Image too large to decode"); - for (i=0; i < s->img_n; ++i) { - if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; - if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; - } + for (i = 0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) + h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) + v_max = z->img_comp[i].v; + } - // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios - // and I've never seen a non-corrupted JPEG file actually use them - for (i=0; i < s->img_n; ++i) { - if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG"); - if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG"); - } + // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios + // and I've never seen a non-corrupted JPEG file actually use them + for (i = 0; i < s->img_n; ++i) { + if (h_max % z->img_comp[i].h != 0) + return stbi__err("bad H", "Corrupt JPEG"); + if (v_max % z->img_comp[i].v != 0) + return stbi__err("bad V", "Corrupt JPEG"); + } - // compute interleaved mcu info - z->img_h_max = h_max; - z->img_v_max = v_max; - z->img_mcu_w = h_max * 8; - z->img_mcu_h = v_max * 8; - // these sizes can't be more than 17 bits - z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w; - z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h; + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + // these sizes can't be more than 17 bits + z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h; - for (i=0; i < s->img_n; ++i) { - // number of effective pixels (e.g. for non-interleaved MCU) - z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max; - z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max; - // to simplify generation, we'll allocate enough memory to decode - // the bogus oversized data from using interleaved MCUs and their - // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't - // discard the extra data until colorspace conversion - // - // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) - // so these muls can't overflow with 32-bit ints (which we require) - z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; - z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; - z->img_comp[i].coeff = 0; - z->img_comp[i].raw_coeff = 0; - z->img_comp[i].linebuf = NULL; - z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); - if (z->img_comp[i].raw_data == NULL) - return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); - // align blocks for idct using mmx/sse - z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15); - if (z->progressive) { - // w2, h2 are multiples of 8 (see above) - z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; - z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; - z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); - if (z->img_comp[i].raw_coeff == NULL) - return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); - z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15); - } - } + for (i = 0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + // + // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) + // so these muls can't overflow with 32-bit ints (which we require) + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].coeff = 0; + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].linebuf = NULL; + z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); + if (z->img_comp[i].raw_data == NULL) + return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory")); + // align blocks for idct using mmx/sse + z->img_comp[i].data = (stbi_uc *)(((size_t)z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { + // w2, h2 are multiples of 8 (see above) + z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; + z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; + z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); + if (z->img_comp[i].raw_coeff == NULL) + return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory")); + z->img_comp[i].coeff = (short *)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15); + } + } - return 1; + return 1; } // use comparisons since in some cases we handle more than one case (e.g. 
SOF) -#define stbi__DNL(x) ((x) == 0xdc) -#define stbi__SOI(x) ((x) == 0xd8) -#define stbi__EOI(x) ((x) == 0xd9) -#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) -#define stbi__SOS(x) ((x) == 0xda) +#define stbi__DNL(x) ((x) == 0xdc) +#define stbi__SOI(x) ((x) == 0xd8) +#define stbi__EOI(x) ((x) == 0xd9) +#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define stbi__SOS(x) ((x) == 0xda) -#define stbi__SOF_progressive(x) ((x) == 0xc2) +#define stbi__SOF_progressive(x) ((x) == 0xc2) -static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) -{ - int m; - z->jfif = 0; - z->app14_color_transform = -1; // valid values are 0,1,2 - z->marker = STBI__MARKER_none; // initialize cached marker to empty - m = stbi__get_marker(z); - if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG"); - if (scan == STBI__SCAN_type) return 1; - m = stbi__get_marker(z); - while (!stbi__SOF(m)) { - if (!stbi__process_marker(z,m)) return 0; - m = stbi__get_marker(z); - while (m == STBI__MARKER_none) { - // some files have extra padding after their blocks, so ok, we'll scan - if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); - m = stbi__get_marker(z); - } - } - z->progressive = stbi__SOF_progressive(m); - if (!stbi__process_frame_header(z, scan)) return 0; - return 1; +static int stbi__decode_jpeg_header(stbi__jpeg * z, int scan) { + int m; + z->jfif = 0; + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = STBI__MARKER_none; // initialize cached marker to empty + m = stbi__get_marker(z); + if (!stbi__SOI(m)) + return stbi__err("no SOI", "Corrupt JPEG"); + if (scan == STBI__SCAN_type) + return 1; + m = stbi__get_marker(z); + while (!stbi__SOF(m)) { + if (!stbi__process_marker(z, m)) + return 0; + m = stbi__get_marker(z); + while (m == STBI__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (stbi__at_eof(z->s)) + return stbi__err("no SOF", "Corrupt JPEG"); + m = stbi__get_marker(z); + } + } + z->progressive = stbi__SOF_progressive(m); + if (!stbi__process_frame_header(z, scan)) + return 0; + return 1; } -static stbi_uc stbi__skip_jpeg_junk_at_end(stbi__jpeg *j) -{ - // some JPEGs have junk at end, skip over it but if we find what looks - // like a valid marker, resume there - while (!stbi__at_eof(j->s)) { - stbi_uc x = stbi__get8(j->s); - while (x == 0xff) { // might be a marker - if (stbi__at_eof(j->s)) return STBI__MARKER_none; - x = stbi__get8(j->s); - if (x != 0x00 && x != 0xff) { - // not a stuffed zero or lead-in to another marker, looks - // like an actual marker, return it - return x; - } - // stuffed zero has x=0 now which ends the loop, meaning we go - // back to regular scan loop. - // repeated 0xff keeps trying to read the next byte of the marker. - } - } - return STBI__MARKER_none; +static int stbi__skip_jpeg_junk_at_end(stbi__jpeg * j) { + // some JPEGs have junk at end, skip over it but if we find what looks + // like a valid marker, resume there + while (!stbi__at_eof(j->s)) { + int x = stbi__get8(j->s); + while (x == 255) { // might be a marker + if (stbi__at_eof(j->s)) + return STBI__MARKER_none; + x = stbi__get8(j->s); + if (x != 0x00 && x != 0xff) { + // not a stuffed zero or lead-in to another marker, looks + // like an actual marker, return it + return x; + } + // stuffed zero has x=0 now which ends the loop, meaning we go + // back to regular scan loop. + // repeated 0xff keeps trying to read the next byte of the marker. 
+ } + } + return STBI__MARKER_none; } // decode image to YCbCr format -static int stbi__decode_jpeg_image(stbi__jpeg *j) -{ - int m; - for (m = 0; m < 4; m++) { - j->img_comp[m].raw_data = NULL; - j->img_comp[m].raw_coeff = NULL; - } - j->restart_interval = 0; - if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; - m = stbi__get_marker(j); - while (!stbi__EOI(m)) { - if (stbi__SOS(m)) { - if (!stbi__process_scan_header(j)) return 0; - if (!stbi__parse_entropy_coded_data(j)) return 0; - if (j->marker == STBI__MARKER_none ) { - j->marker = stbi__skip_jpeg_junk_at_end(j); - // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 - } - m = stbi__get_marker(j); - if (STBI__RESTART(m)) +static int stbi__decode_jpeg_image(stbi__jpeg * j) { + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) + return 0; + m = stbi__get_marker(j); + while (!stbi__EOI(m)) { + if (stbi__SOS(m)) { + if (!stbi__process_scan_header(j)) + return 0; + if (!stbi__parse_entropy_coded_data(j)) + return 0; + if (j->marker == STBI__MARKER_none) { + j->marker = stbi__skip_jpeg_junk_at_end(j); + // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 + } m = stbi__get_marker(j); - } else if (stbi__DNL(m)) { - int Ld = stbi__get16be(j->s); - stbi__uint32 NL = stbi__get16be(j->s); - if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); - if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); - m = stbi__get_marker(j); - } else { - if (!stbi__process_marker(j, m)) return 1; - m = stbi__get_marker(j); - } - } - if (j->progressive) - stbi__jpeg_finish(j); - return 1; + if (STBI__RESTART(m)) + m = stbi__get_marker(j); + } else if (stbi__DNL(m)) { + int Ld = stbi__get16be(j->s); + stbi__uint32 NL = stbi__get16be(j->s); + if (Ld != 4) + return stbi__err("bad DNL len", "Corrupt JPEG"); + if (NL != j->s->img_y) + return stbi__err("bad DNL height", "Corrupt JPEG"); + m = stbi__get_marker(j); + } else { + if (!stbi__process_marker(j, m)) + return 1; + m = stbi__get_marker(j); + } + } + if (j->progressive) + stbi__jpeg_finish(j); + return 1; } // static jfif-centered resampling (across block boundaries) -typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1, - int w, int hs); +typedef stbi_uc * (*resample_row_func)(stbi_uc * out, stbi_uc * in0, stbi_uc * in1, int w, int hs); -#define stbi__div4(x) ((stbi_uc) ((x) >> 2)) +#define stbi__div4(x) ((stbi_uc)((x) >> 2)) -static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) -{ - STBI_NOTUSED(out); - STBI_NOTUSED(in_far); - STBI_NOTUSED(w); - STBI_NOTUSED(hs); - return in_near; +static stbi_uc * resample_row_1(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + STBI_NOTUSED(out); + STBI_NOTUSED(in_far); + STBI_NOTUSED(w); + STBI_NOTUSED(hs); + return in_near; } -static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) -{ - // need to generate two samples vertically for every one in input - int i; - STBI_NOTUSED(hs); - for (i=0; i < w; ++i) - out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2); - return out; +static stbi_uc * stbi__resample_row_v_2(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + // need to generate two samples vertically for every one in input + int i; + 
STBI_NOTUSED(hs); + for (i = 0; i < w; ++i) + out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2); + return out; } -static stbi_uc* stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) -{ - // need to generate two samples horizontally for every one in input - int i; - stbi_uc *input = in_near; +static stbi_uc * stbi__resample_row_h_2(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + // need to generate two samples horizontally for every one in input + int i; + stbi_uc * input = in_near; - if (w == 1) { - // if only one sample, can't do any interpolation - out[0] = out[1] = input[0]; - return out; - } + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } - out[0] = input[0]; - out[1] = stbi__div4(input[0]*3 + input[1] + 2); - for (i=1; i < w-1; ++i) { - int n = 3*input[i]+2; - out[i*2+0] = stbi__div4(n+input[i-1]); - out[i*2+1] = stbi__div4(n+input[i+1]); - } - out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2); - out[i*2+1] = input[w-1]; + out[0] = input[0]; + out[1] = stbi__div4(input[0] * 3 + input[1] + 2); + for (i = 1; i < w - 1; ++i) { + int n = 3 * input[i] + 2; + out[i * 2 + 0] = stbi__div4(n + input[i - 1]); + out[i * 2 + 1] = stbi__div4(n + input[i + 1]); + } + out[i * 2 + 0] = stbi__div4(input[w - 2] * 3 + input[w - 1] + 2); + out[i * 2 + 1] = input[w - 1]; - STBI_NOTUSED(in_far); - STBI_NOTUSED(hs); + STBI_NOTUSED(in_far); + STBI_NOTUSED(hs); - return out; + return out; } -#define stbi__div16(x) ((stbi_uc) ((x) >> 4)) +#define stbi__div16(x) ((stbi_uc)((x) >> 4)) -static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) -{ - // need to generate 2x2 samples for every one in input - int i,t0,t1; - if (w == 1) { - out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); - return out; - } +static stbi_uc * stbi__resample_row_hv_2(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + // need to generate 2x2 samples for every one in input + int i, t0, t1; + if (w == 1) { + out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2); + return out; + } - t1 = 3*in_near[0] + in_far[0]; - out[0] = stbi__div4(t1+2); - for (i=1; i < w; ++i) { - t0 = t1; - t1 = 3*in_near[i]+in_far[i]; - out[i*2-1] = stbi__div16(3*t0 + t1 + 8); - out[i*2 ] = stbi__div16(3*t1 + t0 + 8); - } - out[w*2-1] = stbi__div4(t1+2); + t1 = 3 * in_near[0] + in_far[0]; + out[0] = stbi__div4(t1 + 2); + for (i = 1; i < w; ++i) { + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8); + out[i * 2] = stbi__div16(3 * t1 + t0 + 8); + } + out[w * 2 - 1] = stbi__div4(t1 + 2); - STBI_NOTUSED(hs); + STBI_NOTUSED(hs); - return out; + return out; } #if defined(STBI_SSE2) || defined(STBI_NEON) -static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) -{ - // need to generate 2x2 samples for every one in input - int i=0,t0,t1; +static stbi_uc * stbi__resample_row_hv_2_simd(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + // need to generate 2x2 samples for every one in input + int i = 0, t0, t1; - if (w == 1) { - out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); - return out; - } + if (w == 1) { + out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2); + return out; + } - t1 = 3*in_near[0] + in_far[0]; - // process groups of 8 pixels for as long as we can. 
- // note we can't handle the last pixel in a row in this loop - // because we need to handle the filter boundary conditions. - for (; i < ((w-1) & ~7); i += 8) { + t1 = 3 * in_near[0] + in_far[0]; + // process groups of 8 pixels for as long as we can. + // note we can't handle the last pixel in a row in this loop + // because we need to handle the filter boundary conditions. + for (; i < ((w - 1) & ~7); i += 8) { #if defined(STBI_SSE2) - // load and perform the vertical filtering pass - // this uses 3*x + y = 4*x + (y - x) - __m128i zero = _mm_setzero_si128(); - __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i)); - __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i)); - __m128i farw = _mm_unpacklo_epi8(farb, zero); - __m128i nearw = _mm_unpacklo_epi8(nearb, zero); - __m128i diff = _mm_sub_epi16(farw, nearw); - __m128i nears = _mm_slli_epi16(nearw, 2); - __m128i curr = _mm_add_epi16(nears, diff); // current row + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i *)(in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i *)(in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); // current row - // horizontal filter works the same based on shifted vers of current - // row. "prev" is current row shifted right by 1 pixel; we need to - // insert the previous pixel value (from t1). - // "next" is current row shifted left by 1 pixel, with first pixel - // of next block of 8 pixels added in. - __m128i prv0 = _mm_slli_si128(curr, 2); - __m128i nxt0 = _mm_srli_si128(curr, 2); - __m128i prev = _mm_insert_epi16(prv0, t1, 0); - __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7); + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7); - // horizontal filter, polyphase implementation since it's convenient: - // even pixels = 3*cur + prev = cur*4 + (prev - cur) - // odd pixels = 3*cur + next = cur*4 + (next - cur) - // note the shared term. - __m128i bias = _mm_set1_epi16(8); - __m128i curs = _mm_slli_epi16(curr, 2); - __m128i prvd = _mm_sub_epi16(prev, curr); - __m128i nxtd = _mm_sub_epi16(next, curr); - __m128i curb = _mm_add_epi16(curs, bias); - __m128i even = _mm_add_epi16(prvd, curb); - __m128i odd = _mm_add_epi16(nxtd, curb); + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); - // interleave even and odd pixels, then undo scaling. 
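// The "3*x + y = 4*x + (y - x)" rewrite used for the vertical pass above is
// plain algebra, but it is what lets the 3:1 blend run without a multiply:
//     4*near + (far - near) = 3*near + far
// where 4*near is a shift (_mm_slli_epi16 / vshll_n_u8 by 2) and (far - near)
// is a single widening subtract; all intermediates stay within 16 bits since
// 3*255 + 255 = 1020.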
- __m128i int0 = _mm_unpacklo_epi16(even, odd); - __m128i int1 = _mm_unpackhi_epi16(even, odd); - __m128i de0 = _mm_srli_epi16(int0, 4); - __m128i de1 = _mm_srli_epi16(int1, 4); + // interleave even and odd pixels, then undo scaling. + __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); - // pack and write output - __m128i outv = _mm_packus_epi16(de0, de1); - _mm_storeu_si128((__m128i *) (out + i*2), outv); + // pack and write output + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i *)(out + i * 2), outv); #elif defined(STBI_NEON) - // load and perform the vertical filtering pass - // this uses 3*x + y = 4*x + (y - x) - uint8x8_t farb = vld1_u8(in_far + i); - uint8x8_t nearb = vld1_u8(in_near + i); - int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); - int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); - int16x8_t curr = vaddq_s16(nears, diff); // current row + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row - // horizontal filter works the same based on shifted vers of current - // row. "prev" is current row shifted right by 1 pixel; we need to - // insert the previous pixel value (from t1). - // "next" is current row shifted left by 1 pixel, with first pixel - // of next block of 8 pixels added in. - int16x8_t prv0 = vextq_s16(curr, curr, 7); - int16x8_t nxt0 = vextq_s16(curr, curr, 1); - int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); - int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7); + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7); - // horizontal filter, polyphase implementation since it's convenient: - // even pixels = 3*cur + prev = cur*4 + (prev - cur) - // odd pixels = 3*cur + next = cur*4 + (next - cur) - // note the shared term. - int16x8_t curs = vshlq_n_s16(curr, 2); - int16x8_t prvd = vsubq_s16(prev, curr); - int16x8_t nxtd = vsubq_s16(next, curr); - int16x8_t even = vaddq_s16(curs, prvd); - int16x8_t odd = vaddq_s16(curs, nxtd); + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. 
+ int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); - // undo scaling and round, then store with even/odd phases interleaved - uint8x8x2_t o; - o.val[0] = vqrshrun_n_s16(even, 4); - o.val[1] = vqrshrun_n_s16(odd, 4); - vst2_u8(out + i*2, o); + // undo scaling and round, then store with even/odd phases interleaved + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i * 2, o); #endif - // "previous" value for next iter - t1 = 3*in_near[i+7] + in_far[i+7]; - } + // "previous" value for next iter + t1 = 3 * in_near[i + 7] + in_far[i + 7]; + } - t0 = t1; - t1 = 3*in_near[i] + in_far[i]; - out[i*2] = stbi__div16(3*t1 + t0 + 8); + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2] = stbi__div16(3 * t1 + t0 + 8); - for (++i; i < w; ++i) { - t0 = t1; - t1 = 3*in_near[i]+in_far[i]; - out[i*2-1] = stbi__div16(3*t0 + t1 + 8); - out[i*2 ] = stbi__div16(3*t1 + t0 + 8); - } - out[w*2-1] = stbi__div4(t1+2); + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8); + out[i * 2] = stbi__div16(3 * t1 + t0 + 8); + } + out[w * 2 - 1] = stbi__div4(t1 + 2); - STBI_NOTUSED(hs); + STBI_NOTUSED(hs); - return out; + return out; } #endif -static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) -{ - // resample with nearest-neighbor - int i,j; - STBI_NOTUSED(in_far); - for (i=0; i < w; ++i) - for (j=0; j < hs; ++j) - out[i*hs+j] = in_near[i]; - return out; +static stbi_uc * stbi__resample_row_generic(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + // resample with nearest-neighbor + int i, j; + STBI_NOTUSED(in_far); + for (i = 0; i < w; ++i) + for (j = 0; j < hs; ++j) + out[i * hs + j] = in_near[i]; + return out; } // this is a reduced-precision calculation of YCbCr-to-RGB introduced // to make sure the code produces the same results in both SIMD and scalar -#define stbi__float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) -static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) -{ - int i; - for (i=0; i < count; ++i) { - int y_fixed = (y[i] << 20) + (1<<19); // rounding - int r,g,b; - int cr = pcr[i] - 128; - int cb = pcb[i] - 128; - r = y_fixed + cr* stbi__float2fixed(1.40200f); - g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); - b = y_fixed + cb* stbi__float2fixed(1.77200f); - r >>= 20; - g >>= 20; - b >>= 20; - if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } - if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } - if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } - out[0] = (stbi_uc)r; - out[1] = (stbi_uc)g; - out[2] = (stbi_uc)b; - out[3] = 255; - out += step; - } +#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8) +static void stbi__YCbCr_to_RGB_row(stbi_uc * out, const stbi_uc * y, const stbi_uc * pcb, const stbi_uc * pcr, int count, + int step) { + int i; + for (i = 0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1 << 19); // rounding + int r, g, b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr * stbi__float2fixed(1.40200f); + g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) + ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb * 
stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned)r > 255) { + if (r < 0) + r = 0; + else + r = 255; + } + if ((unsigned)g > 255) { + if (g < 0) + g = 0; + else + g = 255; + } + if ((unsigned)b > 255) { + if (b < 0) + b = 0; + else + b = 255; + } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } } #if defined(STBI_SSE2) || defined(STBI_NEON) -static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) -{ - int i = 0; +static void stbi__YCbCr_to_RGB_simd(stbi_uc * out, stbi_uc const * y, stbi_uc const * pcb, stbi_uc const * pcr, int count, + int step) { + int i = 0; #ifdef STBI_SSE2 - // step == 3 is pretty ugly on the final interleave, and i'm not convinced - // it's useful in practice (you wouldn't use it for textures, for example). - // so just accelerate step == 4 case. - if (step == 4) { - // this is a fairly straightforward implementation and not super-optimized. - __m128i signflip = _mm_set1_epi8(-0x80); - __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); - __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); - __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); - __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f)); - __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128); - __m128i xw = _mm_set1_epi16(255); // alpha channel + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
+ __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f)); + __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f)); + __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f)); + __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f)); + __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128); + __m128i xw = _mm_set1_epi16(255); // alpha channel - for (; i+7 < count; i += 8) { - // load - __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i)); - __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i)); - __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i)); - __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 - __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + for (; i + 7 < count; i += 8) { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 - // unpack to short (and left-shift cr, cb by 8) - __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); - __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); - __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); - // color transform - __m128i yws = _mm_srli_epi16(yw, 4); - __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); - __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); - __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); - __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); - __m128i rws = _mm_add_epi16(cr0, yws); - __m128i gwt = _mm_add_epi16(cb0, yws); - __m128i bws = _mm_add_epi16(yws, cb1); - __m128i gws = _mm_add_epi16(gwt, cr1); + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); - // descale - __m128i rw = _mm_srai_epi16(rws, 4); - __m128i bw = _mm_srai_epi16(bws, 4); - __m128i gw = _mm_srai_epi16(gws, 4); + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); - // back to byte, set up for transpose - __m128i brb = _mm_packus_epi16(rw, bw); - __m128i gxb = _mm_packus_epi16(gw, xw); + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); - // transpose to interleave channels - __m128i t0 = _mm_unpacklo_epi8(brb, gxb); - __m128i t1 = _mm_unpackhi_epi8(brb, gxb); - __m128i o0 = _mm_unpacklo_epi16(t0, t1); - __m128i o1 = _mm_unpackhi_epi16(t0, t1); + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); - // store - _mm_storeu_si128((__m128i *) (out + 0), o0); - _mm_storeu_si128((__m128i *) (out + 16), o1); - out += 32; - } - } + // store + 
_mm_storeu_si128((__m128i *)(out + 0), o0); + _mm_storeu_si128((__m128i *)(out + 16), o1); + out += 32; + } + } #endif #ifdef STBI_NEON - // in this version, step=3 support would be easy to add. but is there demand? - if (step == 4) { - // this is a fairly straightforward implementation and not super-optimized. - uint8x8_t signflip = vdup_n_u8(0x80); - int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f)); - int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f)); - int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f)); - int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f)); + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. + uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f)); + int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f)); + int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f)); + int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f)); - for (; i+7 < count; i += 8) { - // load - uint8x8_t y_bytes = vld1_u8(y + i); - uint8x8_t cr_bytes = vld1_u8(pcr + i); - uint8x8_t cb_bytes = vld1_u8(pcb + i); - int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); - int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + for (; i + 7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); - // expand to s16 - int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); - int16x8_t crw = vshll_n_s8(cr_biased, 7); - int16x8_t cbw = vshll_n_s8(cb_biased, 7); + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); - // color transform - int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); - int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); - int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); - int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); - int16x8_t rws = vaddq_s16(yws, cr0); - int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); - int16x8_t bws = vaddq_s16(yws, cb1); + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); - // undo scaling, round, convert to byte - uint8x8x4_t o; - o.val[0] = vqrshrun_n_s16(rws, 4); - o.val[1] = vqrshrun_n_s16(gws, 4); - o.val[2] = vqrshrun_n_s16(bws, 4); - o.val[3] = vdup_n_u8(255); + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); - // store, interleaving r/g/b/a - vst4_u8(out, o); - out += 8*4; - } - } + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8 * 4; + } + } #endif - for (; i < count; ++i) { - int y_fixed = (y[i] << 20) + (1<<19); // rounding - int r,g,b; - int cr = pcr[i] - 128; - int cb = pcb[i] - 128; - r = y_fixed + cr* stbi__float2fixed(1.40200f); - 
g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); - b = y_fixed + cb* stbi__float2fixed(1.77200f); - r >>= 20; - g >>= 20; - b >>= 20; - if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } - if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } - if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } - out[0] = (stbi_uc)r; - out[1] = (stbi_uc)g; - out[2] = (stbi_uc)b; - out[3] = 255; - out += step; - } + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1 << 19); // rounding + int r, g, b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr * stbi__float2fixed(1.40200f); + g = y_fixed + cr * -stbi__float2fixed(0.71414f) + ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb * stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned)r > 255) { + if (r < 0) + r = 0; + else + r = 255; + } + if ((unsigned)g > 255) { + if (g < 0) + g = 0; + else + g = 255; + } + if ((unsigned)b > 255) { + if (b < 0) + b = 0; + else + b = 255; + } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } } #endif // set up the kernels -static void stbi__setup_jpeg(stbi__jpeg *j) -{ - j->idct_block_kernel = stbi__idct_block; - j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; - j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; +static void stbi__setup_jpeg(stbi__jpeg * j) { + j->idct_block_kernel = stbi__idct_block; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; #ifdef STBI_SSE2 - if (stbi__sse2_available()) { - j->idct_block_kernel = stbi__idct_simd; - j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; - j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; - } + if (stbi__sse2_available()) { + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; + } #endif #ifdef STBI_NEON - j->idct_block_kernel = stbi__idct_simd; - j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; - j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; #endif } // clean up the temporary component buffers -static void stbi__cleanup_jpeg(stbi__jpeg *j) -{ - stbi__free_jpeg_components(j, j->s->img_n, 0); -} +static void stbi__cleanup_jpeg(stbi__jpeg * j) { stbi__free_jpeg_components(j, j->s->img_n, 0); } -typedef struct -{ - resample_row_func resample; - stbi_uc *line0,*line1; - int hs,vs; // expansion factor in each axis - int w_lores; // horizontal pixels pre-expansion - int ystep; // how far through vertical expansion we are - int ypos; // which pre-expansion row we're on +typedef struct { + resample_row_func resample; + stbi_uc *line0, *line1; + int hs, vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on } stbi__resample; // fast 0..255 * 0..255 => 0..255 rounded multiplication -static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) -{ - unsigned int t = x*y + 128; - return (stbi_uc) ((t + (t >>8)) >> 8); +static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) { + unsigned int t = x * y + 128; + return (stbi_uc)((t + (t >> 8)) >> 8); } -static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, 
int *comp, int req_comp) -{ - int n, decode_n, is_rgb; - z->s->img_n = 0; // make stbi__cleanup_jpeg safe +static stbi_uc * load_jpeg_image(stbi__jpeg * z, int * out_x, int * out_y, int * comp, int req_comp) { + int n, decode_n, is_rgb; + z->s->img_n = 0; // make stbi__cleanup_jpeg safe - // validate req_comp - if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + // validate req_comp + if (req_comp < 0 || req_comp > 4) + return stbi__errpuc("bad req_comp", "Internal error"); - // load a jpeg image from whichever source, but leave in YCbCr format - if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; } + // load a jpeg image from whichever source, but leave in YCbCr format + if (!stbi__decode_jpeg_image(z)) { + stbi__cleanup_jpeg(z); + return NULL; + } - // determine actual number of components to generate - n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + // determine actual number of components to generate + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; - is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); + is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); - if (z->s->img_n == 3 && n < 3 && !is_rgb) - decode_n = 1; - else - decode_n = z->s->img_n; + if (z->s->img_n == 3 && n < 3 && !is_rgb) + decode_n = 1; + else + decode_n = z->s->img_n; - // nothing to do if no components requested; check this now to avoid - // accessing uninitialized coutput[0] later - if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; } + // nothing to do if no components requested; check this now to avoid + // accessing uninitialized coutput[0] later + if (decode_n <= 0) { + stbi__cleanup_jpeg(z); + return NULL; + } - // resample and color-convert - { - int k; - unsigned int i,j; - stbi_uc *output; - stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL }; + // resample and color-convert + { + int k; + unsigned int i, j; + stbi_uc * output; + stbi_uc * coutput[4] = {NULL, NULL, NULL, NULL}; - stbi__resample res_comp[4]; + stbi__resample res_comp[4]; - for (k=0; k < decode_n; ++k) { - stbi__resample *r = &res_comp[k]; + for (k = 0; k < decode_n; ++k) { + stbi__resample * r = &res_comp[k]; - // allocate line buffer big enough for upsampling off the edges - // with upsample factor of 4 - z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3); - if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } - - r->hs = z->img_h_max / z->img_comp[k].h; - r->vs = z->img_v_max / z->img_comp[k].v; - r->ystep = r->vs >> 1; - r->w_lores = (z->s->img_x + r->hs-1) / r->hs; - r->ypos = 0; - r->line0 = r->line1 = z->img_comp[k].data; - - if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; - else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2; - else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2; - else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; - else r->resample = stbi__resample_row_generic; - } - - // can't error after this so, this is safe - output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); - if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } - - // now go ahead and resample - for (j=0; j < z->s->img_y; ++j) { - stbi_uc *out = output + n * z->s->img_x * j; - for (k=0; k < decode_n; ++k) { - stbi__resample *r = &res_comp[k]; - int y_bot = r->ystep >= (r->vs >> 1); - coutput[k] = 
r->resample(z->img_comp[k].linebuf, - y_bot ? r->line1 : r->line0, - y_bot ? r->line0 : r->line1, - r->w_lores, r->hs); - if (++r->ystep >= r->vs) { - r->ystep = 0; - r->line0 = r->line1; - if (++r->ypos < z->img_comp[k].y) - r->line1 += z->img_comp[k].w2; + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (stbi_uc *)stbi__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { + stbi__cleanup_jpeg(z); + return stbi__errpuc("outofmem", "Out of memory"); } - } - if (n >= 3) { - stbi_uc *y = coutput[0]; - if (z->s->img_n == 3) { - if (is_rgb) { - for (i=0; i < z->s->img_x; ++i) { - out[0] = y[i]; - out[1] = coutput[1][i]; - out[2] = coutput[2][i]; - out[3] = 255; - out += n; - } - } else { - z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); - } - } else if (z->s->img_n == 4) { - if (z->app14_color_transform == 0) { // CMYK - for (i=0; i < z->s->img_x; ++i) { - stbi_uc m = coutput[3][i]; - out[0] = stbi__blinn_8x8(coutput[0][i], m); - out[1] = stbi__blinn_8x8(coutput[1][i], m); - out[2] = stbi__blinn_8x8(coutput[2][i], m); - out[3] = 255; - out += n; - } - } else if (z->app14_color_transform == 2) { // YCCK - z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); - for (i=0; i < z->s->img_x; ++i) { - stbi_uc m = coutput[3][i]; - out[0] = stbi__blinn_8x8(255 - out[0], m); - out[1] = stbi__blinn_8x8(255 - out[1], m); - out[2] = stbi__blinn_8x8(255 - out[2], m); - out += n; - } - } else { // YCbCr + alpha? Ignore the fourth channel for now - z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); - } - } else - for (i=0; i < z->s->img_x; ++i) { - out[0] = out[1] = out[2] = y[i]; - out[3] = 255; // not used if n==3 - out += n; - } - } else { - if (is_rgb) { - if (n == 1) - for (i=0; i < z->s->img_x; ++i) - *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); - else { - for (i=0; i < z->s->img_x; ++i, out += 2) { - out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); - out[1] = 255; - } - } - } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { - for (i=0; i < z->s->img_x; ++i) { - stbi_uc m = coutput[3][i]; - stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); - stbi_uc g = stbi__blinn_8x8(coutput[1][i], m); - stbi_uc b = stbi__blinn_8x8(coutput[2][i], m); - out[0] = stbi__compute_y(r, g, b); - out[1] = 255; - out += n; - } - } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { - for (i=0; i < z->s->img_x; ++i) { - out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); - out[1] = 255; - out += n; - } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s->img_x + r->hs - 1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && r->vs == 1) + r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) + r->resample = stbi__resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) + r->resample = stbi__resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) + r->resample = z->resample_row_hv_2_kernel; + else + r->resample = stbi__resample_row_generic; + } + + // can't error after this so, this is safe + output = (stbi_uc *)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { + stbi__cleanup_jpeg(z); + return stbi__errpuc("outofmem", "Out of memory"); + } + + // now go ahead and resample + for (j = 0; j < z->s->img_y; ++j) { + stbi_uc * out = output + n * z->s->img_x * j; + for (k = 
0; k < decode_n; ++k) { + stbi__resample * r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + coutput[k] = r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0, y_bot ? r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + stbi_uc * y = coutput[0]; + if (z->s->img_n == 3) { + if (is_rgb) { + for (i = 0; i < z->s->img_x; ++i) { + out[0] = y[i]; + out[1] = coutput[1][i]; + out[2] = coutput[2][i]; + out[3] = 255; + out += n; + } + } else { + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else if (z->s->img_n == 4) { + if (z->app14_color_transform == 0) { // CMYK + for (i = 0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(coutput[0][i], m); + out[1] = stbi__blinn_8x8(coutput[1][i], m); + out[2] = stbi__blinn_8x8(coutput[2][i], m); + out[3] = 255; + out += n; + } + } else if (z->app14_color_transform == 2) { // YCCK + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + for (i = 0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(255 - out[0], m); + out[1] = stbi__blinn_8x8(255 - out[1], m); + out[2] = stbi__blinn_8x8(255 - out[2], m); + out += n; + } + } else { // YCbCr + alpha? Ignore the fourth channel for now + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else + for (i = 0; i < z->s->img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } } else { - stbi_uc *y = coutput[0]; - if (n == 1) - for (i=0; i < z->s->img_x; ++i) out[i] = y[i]; - else - for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; } + if (is_rgb) { + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) + *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + else { + for (i = 0; i < z->s->img_x; ++i, out += 2) { + out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + out[1] = 255; + } + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { + for (i = 0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); + stbi_uc g = stbi__blinn_8x8(coutput[1][i], m); + stbi_uc b = stbi__blinn_8x8(coutput[2][i], m); + out[0] = stbi__compute_y(r, g, b); + out[1] = 255; + out += n; + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { + for (i = 0; i < z->s->img_x; ++i) { + out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); + out[1] = 255; + out += n; + } + } else { + stbi_uc * y = coutput[0]; + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) + out[i] = y[i]; + else + for (i = 0; i < z->s->img_x; ++i) { + *out++ = y[i]; + *out++ = 255; + } + } } - } - } - stbi__cleanup_jpeg(z); - *out_x = z->s->img_x; - *out_y = z->s->img_y; - if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output - return output; - } + } + stbi__cleanup_jpeg(z); + *out_x = z->s->img_x; + *out_y = z->s->img_y; + if (comp) + *comp = z->s->img_n >= 3 ? 
3 : 1; // report original components, not output + return output; + } } -static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) -{ - unsigned char* result; - stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); - if (!j) return stbi__errpuc("outofmem", "Out of memory"); - memset(j, 0, sizeof(stbi__jpeg)); - STBI_NOTUSED(ri); - j->s = s; - stbi__setup_jpeg(j); - result = load_jpeg_image(j, x,y,comp,req_comp); - STBI_FREE(j); - return result; +static void * stbi__jpeg_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + unsigned char * result; + stbi__jpeg * j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) + return stbi__errpuc("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + STBI_NOTUSED(ri); + j->s = s; + stbi__setup_jpeg(j); + result = load_jpeg_image(j, x, y, comp, req_comp); + STBI_FREE(j); + return result; } -static int stbi__jpeg_test(stbi__context *s) -{ - int r; - stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); - if (!j) return stbi__err("outofmem", "Out of memory"); - memset(j, 0, sizeof(stbi__jpeg)); - j->s = s; - stbi__setup_jpeg(j); - r = stbi__decode_jpeg_header(j, STBI__SCAN_type); - stbi__rewind(s); - STBI_FREE(j); - return r; +static int stbi__jpeg_test(stbi__context * s) { + int r; + stbi__jpeg * j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) + return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + stbi__setup_jpeg(j); + r = stbi__decode_jpeg_header(j, STBI__SCAN_type); + stbi__rewind(s); + STBI_FREE(j); + return r; } -static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp) -{ - if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { - stbi__rewind( j->s ); - return 0; - } - if (x) *x = j->s->img_x; - if (y) *y = j->s->img_y; - if (comp) *comp = j->s->img_n >= 3 ? 3 : 1; - return 1; +static int stbi__jpeg_info_raw(stbi__jpeg * j, int * x, int * y, int * comp) { + if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { + stbi__rewind(j->s); + return 0; + } + if (x) + *x = j->s->img_x; + if (y) + *y = j->s->img_y; + if (comp) + *comp = j->s->img_n >= 3 ? 
3 : 1; + return 1; } -static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) -{ - int result; - stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg))); - if (!j) return stbi__err("outofmem", "Out of memory"); - memset(j, 0, sizeof(stbi__jpeg)); - j->s = s; - result = stbi__jpeg_info_raw(j, x, y, comp); - STBI_FREE(j); - return result; +static int stbi__jpeg_info(stbi__context * s, int * x, int * y, int * comp) { + int result; + stbi__jpeg * j = (stbi__jpeg *)(stbi__malloc(sizeof(stbi__jpeg))); + if (!j) + return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + result = stbi__jpeg_info_raw(j, x, y, comp); + STBI_FREE(j); + return result; } #endif @@ -4088,84 +4278,81 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) #ifndef STBI_NO_ZLIB // fast-way is faster to check than jpeg huffman, but slow way is slower -#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables -#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) +#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables +#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) #define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet // zlib-style huffman encoding // (jpegs packs from left, zlib from right, so can't share code) -typedef struct -{ - stbi__uint16 fast[1 << STBI__ZFAST_BITS]; - stbi__uint16 firstcode[16]; - int maxcode[17]; - stbi__uint16 firstsymbol[16]; - stbi_uc size[STBI__ZNSYMS]; - stbi__uint16 value[STBI__ZNSYMS]; +typedef struct { + stbi__uint16 fast[1 << STBI__ZFAST_BITS]; + stbi__uint16 firstcode[16]; + int maxcode[17]; + stbi__uint16 firstsymbol[16]; + stbi_uc size[STBI__ZNSYMS]; + stbi__uint16 value[STBI__ZNSYMS]; } stbi__zhuffman; -stbi_inline static int stbi__bitreverse16(int n) -{ - n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); - n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); - n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); - n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); - return n; +stbi_inline static int stbi__bitreverse16(int n) { + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; } -stbi_inline static int stbi__bit_reverse(int v, int bits) -{ - STBI_ASSERT(bits <= 16); - // to bit reverse n bits, reverse 16 and shift - // e.g. 11 bits, bit reverse and shift away 5 - return stbi__bitreverse16(v) >> (16-bits); +stbi_inline static int stbi__bit_reverse(int v, int bits) { + STBI_ASSERT(bits <= 16); + // to bit reverse n bits, reverse 16 and shift + // e.g. 
11 bits, bit reverse and shift away 5 + return stbi__bitreverse16(v) >> (16 - bits); } -static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num) -{ - int i,k=0; - int code, next_code[16], sizes[17]; +static int stbi__zbuild_huffman(stbi__zhuffman * z, const stbi_uc * sizelist, int num) { + int i, k = 0; + int code, next_code[16], sizes[17]; - // DEFLATE spec for generating codes - memset(sizes, 0, sizeof(sizes)); - memset(z->fast, 0, sizeof(z->fast)); - for (i=0; i < num; ++i) - ++sizes[sizelist[i]]; - sizes[0] = 0; - for (i=1; i < 16; ++i) - if (sizes[i] > (1 << i)) - return stbi__err("bad sizes", "Corrupt PNG"); - code = 0; - for (i=1; i < 16; ++i) { - next_code[i] = code; - z->firstcode[i] = (stbi__uint16) code; - z->firstsymbol[i] = (stbi__uint16) k; - code = (code + sizes[i]); - if (sizes[i]) - if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG"); - z->maxcode[i] = code << (16-i); // preshift for inner loop - code <<= 1; - k += sizes[i]; - } - z->maxcode[16] = 0x10000; // sentinel - for (i=0; i < num; ++i) { - int s = sizelist[i]; - if (s) { - int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; - stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i); - z->size [c] = (stbi_uc ) s; - z->value[c] = (stbi__uint16) i; - if (s <= STBI__ZFAST_BITS) { - int j = stbi__bit_reverse(next_code[s],s); - while (j < (1 << STBI__ZFAST_BITS)) { - z->fast[j] = fastv; - j += (1 << s); + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i = 0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i = 1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return stbi__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i = 1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (stbi__uint16)code; + z->firstsymbol[i] = (stbi__uint16)k; + code = (code + sizes[i]); + if (sizes[i]) + if (code - 1 >= (1 << i)) + return stbi__err("bad codelengths", "Corrupt PNG"); + z->maxcode[i] = code << (16 - i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i = 0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + stbi__uint16 fastv = (stbi__uint16)((s << 9) | i); + z->size[c] = (stbi_uc)s; + z->value[c] = (stbi__uint16)i; + if (s <= STBI__ZFAST_BITS) { + int j = stbi__bit_reverse(next_code[s], s); + while (j < (1 << STBI__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } } - } - ++next_code[s]; - } - } - return 1; + ++next_code[s]; + } + } + return 1; } // zlib-from-memory implementation for PNG reading @@ -4174,297 +4361,298 @@ static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int // we require PNG read all the IDATs and combine them into a single // memory buffer -typedef struct -{ - stbi_uc *zbuffer, *zbuffer_end; - int num_bits; - int hit_zeof_once; - stbi__uint32 code_buffer; +typedef struct { + stbi_uc *zbuffer, *zbuffer_end; + int num_bits; + stbi__uint32 code_buffer; - char *zout; - char *zout_start; - char *zout_end; - int z_expandable; + char * zout; + char * zout_start; + char * zout_end; + int z_expandable; - stbi__zhuffman z_length, z_distance; + stbi__zhuffman z_length, z_distance; } stbi__zbuf; -stbi_inline static int stbi__zeof(stbi__zbuf *z) -{ - return (z->zbuffer >= z->zbuffer_end); +stbi_inline static int stbi__zeof(stbi__zbuf * z) { return (z->zbuffer >= z->zbuffer_end); } + +stbi_inline static stbi_uc 
stbi__zget8(stbi__zbuf * z) { return stbi__zeof(z) ? 0 : *z->zbuffer++; } + +static void stbi__fill_bits(stbi__zbuf * z) { + do { + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } + z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); } -stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z) -{ - return stbi__zeof(z) ? 0 : *z->zbuffer++; +stbi_inline static unsigned int stbi__zreceive(stbi__zbuf * z, int n) { + unsigned int k; + if (z->num_bits < n) + stbi__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; } -static void stbi__fill_bits(stbi__zbuf *z) -{ - do { - if (z->code_buffer >= (1U << z->num_bits)) { - z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ - return; - } - z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits; - z->num_bits += 8; - } while (z->num_bits <= 24); +static int stbi__zhuffman_decode_slowpath(stbi__zbuf * a, stbi__zhuffman * z) { + int b, s, k; + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = stbi__bit_reverse(a->code_buffer, 16); + for (s = STBI__ZFAST_BITS + 1;; ++s) + if (k < z->maxcode[s]) + break; + if (s >= 16) + return -1; // invalid code! + // code size is s, so: + b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s]; + if (b >= STBI__ZNSYMS) + return -1; // some data was corrupt somewhere! + if (z->size[b] != s) + return -1; // was originally an assert, but report failure instead. + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; } -stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n) -{ - unsigned int k; - if (z->num_bits < n) stbi__fill_bits(z); - k = z->code_buffer & ((1 << n) - 1); - z->code_buffer >>= n; - z->num_bits -= n; - return k; +stbi_inline static int stbi__zhuffman_decode(stbi__zbuf * a, stbi__zhuffman * z) { + int b, s; + if (a->num_bits < 16) { + if (stbi__zeof(a)) { + return -1; /* report error for unexpected end of data. */ + } + stbi__fill_bits(a); + } + b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; + if (b) { + s = b >> 9; + a->code_buffer >>= s; + a->num_bits -= s; + return b & 511; + } + return stbi__zhuffman_decode_slowpath(a, z); } -static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) +static int stbi__zexpand(stbi__zbuf * z, char * zout, int n) // need to make room for n bytes { - int b,s,k; - // not resolved by fast table, so compute it the slow way - // use jpeg approach, which requires MSbits at top - k = stbi__bit_reverse(a->code_buffer, 16); - for (s=STBI__ZFAST_BITS+1; ; ++s) - if (k < z->maxcode[s]) - break; - if (s >= 16) return -1; // invalid code! - // code size is s, so: - b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; - if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere! - if (z->size[b] != s) return -1; // was originally an assert, but report failure instead. 
- a->code_buffer >>= s; - a->num_bits -= s; - return z->value[b]; + char * q; + unsigned int cur, limit, old_limit; + z->zout = zout; + if (!z->z_expandable) + return stbi__err("output buffer limit", "Corrupt PNG"); + cur = (unsigned int)(z->zout - z->zout_start); + limit = old_limit = (unsigned)(z->zout_end - z->zout_start); + if (UINT_MAX - cur < (unsigned)n) + return stbi__err("outofmem", "Out of memory"); + while (cur + n > limit) { + if (limit > UINT_MAX / 2) + return stbi__err("outofmem", "Out of memory"); + limit *= 2; + } + q = (char *)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + STBI_NOTUSED(old_limit); + if (q == NULL) + return stbi__err("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; } -stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) -{ - int b,s; - if (a->num_bits < 16) { - if (stbi__zeof(a)) { - if (!a->hit_zeof_once) { - // This is the first time we hit eof, insert 16 extra padding btis - // to allow us to keep going; if we actually consume any of them - // though, that is invalid data. This is caught later. - a->hit_zeof_once = 1; - a->num_bits += 16; // add 16 implicit zero bits - } else { - // We already inserted our extra 16 padding bits and are again - // out, this stream is actually prematurely terminated. - return -1; - } - } else { - stbi__fill_bits(a); - } - } - b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; - if (b) { - s = b >> 9; - a->code_buffer >>= s; - a->num_bits -= s; - return b & 511; - } - return stbi__zhuffman_decode_slowpath(a, z); -} +static const int stbi__zlength_base[31] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; -static int stbi__zexpand(stbi__zbuf *z, char *zout, int n) // need to make room for n bytes -{ - char *q; - unsigned int cur, limit, old_limit; - z->zout = zout; - if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG"); - cur = (unsigned int) (z->zout - z->zout_start); - limit = old_limit = (unsigned) (z->zout_end - z->zout_start); - if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory"); - while (cur + n > limit) { - if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory"); - limit *= 2; - } - q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); - STBI_NOTUSED(old_limit); - if (q == NULL) return stbi__err("outofmem", "Out of memory"); - z->zout_start = q; - z->zout = q + cur; - z->zout_end = q + limit; - return 1; -} +static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 0, 0}; -static const int stbi__zlength_base[31] = { - 3,4,5,6,7,8,9,10,11,13, - 15,17,19,23,27,31,35,43,51,59, - 67,83,99,115,131,163,195,227,258,0,0 }; +static const int stbi__zdist_base[32] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, + 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, + 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0}; -static const int stbi__zlength_extra[31]= -{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; +static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; -static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, -257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; - -static const int stbi__zdist_extra[32] = -{ 
0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; - -static int stbi__parse_huffman_block(stbi__zbuf *a) -{ - char *zout = a->zout; - for(;;) { - int z = stbi__zhuffman_decode(a, &a->z_length); - if (z < 256) { - if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes - if (zout >= a->zout_end) { - if (!stbi__zexpand(a, zout, 1)) return 0; - zout = a->zout; - } - *zout++ = (char) z; - } else { - stbi_uc *p; - int len,dist; - if (z == 256) { - a->zout = zout; - if (a->hit_zeof_once && a->num_bits < 16) { - // The first time we hit zeof, we inserted 16 extra zero bits into our bit - // buffer so the decoder can just do its speculative decoding. But if we - // actually consumed any of those bits (which is the case when num_bits < 16), - // the stream actually read past the end so it is malformed. - return stbi__err("unexpected end","Corrupt PNG"); +static int stbi__parse_huffman_block(stbi__zbuf * a) { + char * zout = a->zout; + for (;;) { + int z = stbi__zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) + return stbi__err("bad huffman code", "Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { + if (!stbi__zexpand(a, zout, 1)) + return 0; + zout = a->zout; } - return 1; - } - if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data - z -= 257; - len = stbi__zlength_base[z]; - if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]); - z = stbi__zhuffman_decode(a, &a->z_distance); - if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data - dist = stbi__zdist_base[z]; - if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); - if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); - if (len > a->zout_end - zout) { - if (!stbi__zexpand(a, zout, len)) return 0; - zout = a->zout; - } - p = (stbi_uc *) (zout - dist); - if (dist == 1) { // run of one byte; common in images. - stbi_uc v = *p; - if (len) { do *zout++ = v; while (--len); } - } else { - if (len) { do *zout++ = *p++; while (--len); } - } - } - } + *zout++ = (char)z; + } else { + stbi_uc * p; + int len, dist; + if (z == 256) { + a->zout = zout; + return 1; + } + if (z >= 286) + return stbi__err("bad huffman code", + "Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data + z -= 257; + len = stbi__zlength_base[z]; + if (stbi__zlength_extra[z]) + len += stbi__zreceive(a, stbi__zlength_extra[z]); + z = stbi__zhuffman_decode(a, &a->z_distance); + if (z < 0 || z >= 30) + return stbi__err("bad huffman code", + "Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data + dist = stbi__zdist_base[z]; + if (stbi__zdist_extra[z]) + dist += stbi__zreceive(a, stbi__zdist_extra[z]); + if (zout - a->zout_start < dist) + return stbi__err("bad dist", "Corrupt PNG"); + if (zout + len > a->zout_end) { + if (!stbi__zexpand(a, zout, len)) + return 0; + zout = a->zout; + } + p = (stbi_uc *)(zout - dist); + if (dist == 1) { // run of one byte; common in images. 
+ stbi_uc v = *p; + if (len) { + do + *zout++ = v; + while (--len); + } + } else { + if (len) { + do + *zout++ = *p++; + while (--len); + } + } + } + } } -static int stbi__compute_huffman_codes(stbi__zbuf *a) -{ - static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; - stbi__zhuffman z_codelength; - stbi_uc lencodes[286+32+137];//padding for maximum single op - stbi_uc codelength_sizes[19]; - int i,n; +static int stbi__compute_huffman_codes(stbi__zbuf * a) { + static const stbi_uc length_dezigzag[19] = {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + stbi__zhuffman z_codelength; + stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op + stbi_uc codelength_sizes[19]; + int i, n; - int hlit = stbi__zreceive(a,5) + 257; - int hdist = stbi__zreceive(a,5) + 1; - int hclen = stbi__zreceive(a,4) + 4; - int ntot = hlit + hdist; + int hlit = stbi__zreceive(a, 5) + 257; + int hdist = stbi__zreceive(a, 5) + 1; + int hclen = stbi__zreceive(a, 4) + 4; + int ntot = hlit + hdist; - memset(codelength_sizes, 0, sizeof(codelength_sizes)); - for (i=0; i < hclen; ++i) { - int s = stbi__zreceive(a,3); - codelength_sizes[length_dezigzag[i]] = (stbi_uc) s; - } - if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i = 0; i < hclen; ++i) { + int s = stbi__zreceive(a, 3); + codelength_sizes[length_dezigzag[i]] = (stbi_uc)s; + } + if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) + return 0; - n = 0; - while (n < ntot) { - int c = stbi__zhuffman_decode(a, &z_codelength); - if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG"); - if (c < 16) - lencodes[n++] = (stbi_uc) c; - else { - stbi_uc fill = 0; - if (c == 16) { - c = stbi__zreceive(a,2)+3; - if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG"); - fill = lencodes[n-1]; - } else if (c == 17) { - c = stbi__zreceive(a,3)+3; - } else if (c == 18) { - c = stbi__zreceive(a,7)+11; - } else { + n = 0; + while (n < ntot) { + int c = stbi__zhuffman_decode(a, &z_codelength); + if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG"); - } - if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); - memset(lencodes+n, fill, c); - n += c; - } - } - if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG"); - if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; - if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0; - return 1; + if (c < 16) + lencodes[n++] = (stbi_uc)c; + else { + stbi_uc fill = 0; + if (c == 16) { + c = stbi__zreceive(a, 2) + 3; + if (n == 0) + return stbi__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n - 1]; + } else if (c == 17) { + c = stbi__zreceive(a, 3) + 3; + } else if (c == 18) { + c = stbi__zreceive(a, 7) + 11; + } else { + return stbi__err("bad codelengths", "Corrupt PNG"); + } + if (ntot - n < c) + return stbi__err("bad codelengths", "Corrupt PNG"); + memset(lencodes + n, fill, c); + n += c; + } + } + if (n != ntot) + return stbi__err("bad codelengths", "Corrupt PNG"); + if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) + return 0; + if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) + return 0; + return 1; } -static int stbi__parse_uncompressed_block(stbi__zbuf *a) -{ - stbi_uc header[4]; - int len,nlen,k; - if (a->num_bits & 7) - stbi__zreceive(a, a->num_bits & 7); // discard - // drain the bit-packed data into header - k = 0; - while 
(a->num_bits > 0) { - header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check - a->code_buffer >>= 8; - a->num_bits -= 8; - } - if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG"); - // now fill header the normal way - while (k < 4) - header[k++] = stbi__zget8(a); - len = header[1] * 256 + header[0]; - nlen = header[3] * 256 + header[2]; - if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG"); - if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG"); - if (a->zout + len > a->zout_end) - if (!stbi__zexpand(a, a->zout, len)) return 0; - memcpy(a->zout, a->zbuffer, len); - a->zbuffer += len; - a->zout += len; - return 1; +static int stbi__parse_uncompressed_block(stbi__zbuf * a) { + stbi_uc header[4]; + int len, nlen, k; + if (a->num_bits & 7) + stbi__zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check + a->code_buffer >>= 8; + a->num_bits -= 8; + } + if (a->num_bits < 0) + return stbi__err("zlib corrupt", "Corrupt PNG"); + // now fill header the normal way + while (k < 4) + header[k++] = stbi__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) + return stbi__err("zlib corrupt", "Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) + return stbi__err("read past buffer", "Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!stbi__zexpand(a, a->zout, len)) + return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; } -static int stbi__parse_zlib_header(stbi__zbuf *a) -{ - int cmf = stbi__zget8(a); - int cm = cmf & 15; - /* int cinfo = cmf >> 4; */ - int flg = stbi__zget8(a); - if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec - if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec - if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png - if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png - // window = 1 << (8 + cinfo)... but who cares, we fully buffer output - return 1; +static int stbi__parse_zlib_header(stbi__zbuf * a) { + int cmf = stbi__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = stbi__zget8(a); + if (stbi__zeof(a)) + return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec + if ((cmf * 256 + flg) % 31 != 0) + return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec + if (flg & 32) + return stbi__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) + return stbi__err("bad compression", "Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... 
but who cares, we fully buffer output + return 1; } -static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = -{ - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 -}; -static const stbi_uc stbi__zdefault_distance[32] = -{ - 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 -}; +static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8}; +static const stbi_uc stbi__zdefault_distance[32] = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; /* Init algorithm: { @@ -4478,118 +4666,122 @@ Init algorithm: } */ -static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) -{ - int final, type; - if (parse_header) - if (!stbi__parse_zlib_header(a)) return 0; - a->num_bits = 0; - a->code_buffer = 0; - a->hit_zeof_once = 0; - do { - final = stbi__zreceive(a,1); - type = stbi__zreceive(a,2); - if (type == 0) { - if (!stbi__parse_uncompressed_block(a)) return 0; - } else if (type == 3) { - return 0; - } else { - if (type == 1) { - // use fixed code lengths - if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , STBI__ZNSYMS)) return 0; - if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; - } else { - if (!stbi__compute_huffman_codes(a)) return 0; - } - if (!stbi__parse_huffman_block(a)) return 0; - } - } while (!final); - return 1; +static int stbi__parse_zlib(stbi__zbuf * a, int parse_header) { + int final, type; + if (parse_header) + if (!stbi__parse_zlib_header(a)) + return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = stbi__zreceive(a, 1); + type = stbi__zreceive(a, 2); + if (type == 0) { + if (!stbi__parse_uncompressed_block(a)) + return 0; + } else if (type == 3) { + return 0; + } else { + if (type == 1) { + // use fixed code lengths + if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, STBI__ZNSYMS)) + return 0; + if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) + return 0; + } else { + if (!stbi__compute_huffman_codes(a)) + return 0; + } + if (!stbi__parse_huffman_block(a)) + return 0; + } + } while (!final); + return 1; } 
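Side note (reviewer annotation, not part of the patch): the type == 1 branch in stbi__parse_zlib above builds its decoders from the hard-coded stbi__zdefault_length / stbi__zdefault_distance arrays instead of reading per-block code lengths. As a sanity check, those tables should match the fixed-Huffman code lengths defined in RFC 1951, section 3.2.6. The standalone sketch below regenerates them from the spec so the hard-coded rows can be eyeballed against its output; main() and the local array names lengths/dists are illustrative only and do not exist in stb_image.h.

/* Illustrative sketch: rebuild the DEFLATE fixed-Huffman code lengths
 * (RFC 1951, sec. 3.2.6) and print them for comparison with
 * stbi__zdefault_length / stbi__zdefault_distance above. */
#include <stdio.h>

int main(void) {
    unsigned char lengths[288];  /* STBI__ZNSYMS literal/length symbols */
    unsigned char dists[32];     /* 32 distance symbols                 */
    int i;

    for (i = 0;   i <= 143; ++i) lengths[i] = 8;  /* literals 0..143          */
    for (i = 144; i <= 255; ++i) lengths[i] = 9;  /* literals 144..255        */
    for (i = 256; i <= 279; ++i) lengths[i] = 7;  /* end-of-block + lengths   */
    for (i = 280; i <= 287; ++i) lengths[i] = 8;  /* remaining length codes   */
    for (i = 0;   i <  32;  ++i) dists[i]   = 5;  /* every distance code is 5 */

    /* dump as comma-separated rows so the output can be diffed by eye
     * against the two constant tables in the hunk above */
    for (i = 0; i < 288; ++i) printf("%d%s", lengths[i], (i + 1) % 32 ? ", " : ",\n");
    for (i = 0; i < 32;  ++i) printf("%d%s", dists[i],   (i + 1) % 32 ? ", " : "\n");
    return 0;
}

Because these lengths are fixed by the spec, the type == 1 path can skip the code-length transmission entirely and just call stbi__zbuild_huffman on the two constant arrays, which is exactly what the loop in stbi__parse_zlib does.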
-static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header) -{ - a->zout_start = obuf; - a->zout = obuf; - a->zout_end = obuf + olen; - a->z_expandable = exp; +static int stbi__do_zlib(stbi__zbuf * a, char * obuf, int olen, int exp, int parse_header) { + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; - return stbi__parse_zlib(a, parse_header); + return stbi__parse_zlib(a, parse_header); } -STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen) -{ - stbi__zbuf a; - char *p = (char *) stbi__malloc(initial_size); - if (p == NULL) return NULL; - a.zbuffer = (stbi_uc *) buffer; - a.zbuffer_end = (stbi_uc *) buffer + len; - if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { - if (outlen) *outlen = (int) (a.zout - a.zout_start); - return a.zout_start; - } else { - STBI_FREE(a.zout_start); - return NULL; - } +STBIDEF char * stbi_zlib_decode_malloc_guesssize(const char * buffer, int len, int initial_size, int * outlen) { + stbi__zbuf a; + char * p = (char *)stbi__malloc(initial_size); + if (p == NULL) + return NULL; + a.zbuffer = (stbi_uc *)buffer; + a.zbuffer_end = (stbi_uc *)buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) + *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } } -STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen) -{ - return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); +STBIDEF char * stbi_zlib_decode_malloc(char const * buffer, int len, int * outlen) { + return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); } -STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header) -{ - stbi__zbuf a; - char *p = (char *) stbi__malloc(initial_size); - if (p == NULL) return NULL; - a.zbuffer = (stbi_uc *) buffer; - a.zbuffer_end = (stbi_uc *) buffer + len; - if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { - if (outlen) *outlen = (int) (a.zout - a.zout_start); - return a.zout_start; - } else { - STBI_FREE(a.zout_start); - return NULL; - } +STBIDEF char * stbi_zlib_decode_malloc_guesssize_headerflag(const char * buffer, int len, int initial_size, int * outlen, + int parse_header) { + stbi__zbuf a; + char * p = (char *)stbi__malloc(initial_size); + if (p == NULL) + return NULL; + a.zbuffer = (stbi_uc *)buffer; + a.zbuffer_end = (stbi_uc *)buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) + *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } } -STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen) -{ - stbi__zbuf a; - a.zbuffer = (stbi_uc *) ibuffer; - a.zbuffer_end = (stbi_uc *) ibuffer + ilen; - if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) - return (int) (a.zout - a.zout_start); - else - return -1; +STBIDEF int stbi_zlib_decode_buffer(char * obuffer, int olen, char const * ibuffer, int ilen) { + stbi__zbuf a; + a.zbuffer = (stbi_uc *)ibuffer; + a.zbuffer_end = (stbi_uc *)ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) + return (int)(a.zout - a.zout_start); + else + return -1; } -STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen) -{ - stbi__zbuf a; - char *p = (char *) stbi__malloc(16384); - if (p == NULL) return NULL; - a.zbuffer = 
(stbi_uc *) buffer; - a.zbuffer_end = (stbi_uc *) buffer+len; - if (stbi__do_zlib(&a, p, 16384, 1, 0)) { - if (outlen) *outlen = (int) (a.zout - a.zout_start); - return a.zout_start; - } else { - STBI_FREE(a.zout_start); - return NULL; - } +STBIDEF char * stbi_zlib_decode_noheader_malloc(char const * buffer, int len, int * outlen) { + stbi__zbuf a; + char * p = (char *)stbi__malloc(16384); + if (p == NULL) + return NULL; + a.zbuffer = (stbi_uc *)buffer; + a.zbuffer_end = (stbi_uc *)buffer + len; + if (stbi__do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) + *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } } -STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen) -{ - stbi__zbuf a; - a.zbuffer = (stbi_uc *) ibuffer; - a.zbuffer_end = (stbi_uc *) ibuffer + ilen; - if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) - return (int) (a.zout - a.zout_start); - else - return -1; +STBIDEF int stbi_zlib_decode_noheader_buffer(char * obuffer, int olen, const char * ibuffer, int ilen) { + stbi__zbuf a; + a.zbuffer = (stbi_uc *)ibuffer; + a.zbuffer_end = (stbi_uc *)ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) + return (int)(a.zout - a.zout_start); + else + return -1; } #endif @@ -4604,1131 +4796,1303 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char // - uses stb_zlib, a PD zlib implementation with fast huffman decoding #ifndef STBI_NO_PNG -typedef struct -{ - stbi__uint32 length; - stbi__uint32 type; +typedef struct { + stbi__uint32 length; + stbi__uint32 type; } stbi__pngchunk; -static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) -{ - stbi__pngchunk c; - c.length = stbi__get32be(s); - c.type = stbi__get32be(s); - return c; +static stbi__pngchunk stbi__get_chunk_header(stbi__context * s) { + stbi__pngchunk c; + c.length = stbi__get32be(s); + c.type = stbi__get32be(s); + return c; } -static int stbi__check_png_header(stbi__context *s) -{ - static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 }; - int i; - for (i=0; i < 8; ++i) - if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG"); - return 1; +static int stbi__check_png_header(stbi__context * s) { + static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10}; + int i; + for (i = 0; i < 8; ++i) + if (stbi__get8(s) != png_sig[i]) + return stbi__err("bad png sig", "Not a PNG"); + return 1; } -typedef struct -{ - stbi__context *s; - stbi_uc *idata, *expanded, *out; - int depth; +typedef struct { + stbi__context * s; + stbi_uc *idata, *expanded, *out; + int depth; } stbi__png; - enum { - STBI__F_none=0, - STBI__F_sub=1, - STBI__F_up=2, - STBI__F_avg=3, - STBI__F_paeth=4, - // synthetic filter used for first scanline to avoid needing a dummy row of 0s - STBI__F_avg_first + STBI__F_none = 0, + STBI__F_sub = 1, + STBI__F_up = 2, + STBI__F_avg = 3, + STBI__F_paeth = 4, + // synthetic filters used for first scanline to avoid needing a dummy row of 0s + STBI__F_avg_first, + STBI__F_paeth_first }; -static stbi_uc first_row_filter[5] = -{ - STBI__F_none, - STBI__F_sub, - STBI__F_none, - STBI__F_avg_first, - STBI__F_sub // Paeth with b=c=0 turns out to be equivalent to sub -}; +static stbi_uc first_row_filter[5] = {STBI__F_none, STBI__F_sub, STBI__F_none, STBI__F_avg_first, STBI__F_paeth_first}; -static int stbi__paeth(int a, int b, int c) -{ - // This formulation looks very different from the reference in the PNG spec, but is - // actually equivalent and has 
favorable data dependencies and admits straightforward - // generation of branch-free code, which helps performance significantly. - int thresh = c*3 - (a + b); - int lo = a < b ? a : b; - int hi = a < b ? b : a; - int t0 = (hi <= thresh) ? lo : c; - int t1 = (thresh <= lo) ? hi : t0; - return t1; +static int stbi__paeth(int a, int b, int c) { + int p = a + b - c; + int pa = abs(p - a); + int pb = abs(p - b); + int pc = abs(p - c); + if (pa <= pb && pa <= pc) + return a; + if (pb <= pc) + return b; + return c; } -static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; - -// adds an extra all-255 alpha channel -// dest == src is legal -// img_n must be 1 or 3 -static void stbi__create_png_alpha_expand8(stbi_uc *dest, stbi_uc *src, stbi__uint32 x, int img_n) -{ - int i; - // must process data backwards since we allow dest==src - if (img_n == 1) { - for (i=x-1; i >= 0; --i) { - dest[i*2+1] = 255; - dest[i*2+0] = src[i]; - } - } else { - STBI_ASSERT(img_n == 3); - for (i=x-1; i >= 0; --i) { - dest[i*4+3] = 255; - dest[i*4+2] = src[i*3+2]; - dest[i*4+1] = src[i*3+1]; - dest[i*4+0] = src[i*3+0]; - } - } -} +static const stbi_uc stbi__depth_scale_table[9] = {0, 0xff, 0x55, 0, 0x11, 0, 0, 0, 0x01}; // create the png data from post-deflated data -static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color) -{ - int bytes = (depth == 16 ? 2 : 1); - stbi__context *s = a->s; - stbi__uint32 i,j,stride = x*out_n*bytes; - stbi__uint32 img_len, img_width_bytes; - stbi_uc *filter_buf; - int all_ok = 1; - int k; - int img_n = s->img_n; // copy it into a local for later +static int stbi__create_png_image_raw(stbi__png * a, stbi_uc * raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, + stbi__uint32 y, int depth, int color) { + int bytes = (depth == 16 ? 2 : 1); + stbi__context * s = a->s; + stbi__uint32 i, j, stride = x * out_n * bytes; + stbi__uint32 img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later - int output_bytes = out_n*bytes; - int filter_bytes = img_n*bytes; - int width = x; + int output_bytes = out_n * bytes; + int filter_bytes = img_n * bytes; + int width = x; - STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1); - a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into - if (!a->out) return stbi__err("outofmem", "Out of memory"); + STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1); + a->out = (stbi_uc *)stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) + return stbi__err("outofmem", "Out of memory"); - // note: error exits here don't need to clean up a->out individually, - // stbi__do_png always does on error. - if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); - img_width_bytes = (((img_n * x * depth) + 7) >> 3); - if (!stbi__mad2sizes_valid(img_width_bytes, y, img_width_bytes)) return stbi__err("too large", "Corrupt PNG"); - img_len = (img_width_bytes + 1) * y; + if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) + return stbi__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; - // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, - // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), - // so just check for raw_len < img_len always. 
- if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG"); + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. + if (raw_len < img_len) + return stbi__err("not enough pixels", "Corrupt PNG"); - // Allocate two scan lines worth of filter workspace buffer. - filter_buf = (stbi_uc *) stbi__malloc_mad2(img_width_bytes, 2, 0); - if (!filter_buf) return stbi__err("outofmem", "Out of memory"); + for (j = 0; j < y; ++j) { + stbi_uc * cur = a->out + stride * j; + stbi_uc * prior; + int filter = *raw++; - // Filtering for low-bit-depth images - if (depth < 8) { - filter_bytes = 1; - width = img_width_bytes; - } + if (filter > 4) + return stbi__err("invalid filter", "Corrupt PNG"); - for (j=0; j < y; ++j) { - // cur/prior filter buffers alternate - stbi_uc *cur = filter_buf + (j & 1)*img_width_bytes; - stbi_uc *prior = filter_buf + (~j & 1)*img_width_bytes; - stbi_uc *dest = a->out + stride*j; - int nk = width * filter_bytes; - int filter = *raw++; + if (depth < 8) { + if (img_width_bytes > x) + return stbi__err("invalid width", "Corrupt PNG"); + cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above - // check filter type - if (filter > 4) { - all_ok = stbi__err("invalid filter","Corrupt PNG"); - break; - } + // if first row, use special filter that doesn't sample previous row + if (j == 0) + filter = first_row_filter[filter]; - // if first row, use special filter that doesn't sample previous row - if (j == 0) filter = first_row_filter[filter]; - - // perform actual filtering - switch (filter) { - case STBI__F_none: - memcpy(cur, raw, nk); - break; - case STBI__F_sub: - memcpy(cur, raw, filter_bytes); - for (k = filter_bytes; k < nk; ++k) - cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); - break; - case STBI__F_up: - for (k = 0; k < nk; ++k) - cur[k] = STBI__BYTECAST(raw[k] + prior[k]); - break; - case STBI__F_avg: - for (k = 0; k < filter_bytes; ++k) - cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); - for (k = filter_bytes; k < nk; ++k) - cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); - break; - case STBI__F_paeth: - for (k = 0; k < filter_bytes; ++k) - cur[k] = STBI__BYTECAST(raw[k] + prior[k]); // prior[k] == stbi__paeth(0,prior[k],0) - for (k = filter_bytes; k < nk; ++k) - cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes], prior[k], prior[k-filter_bytes])); - break; - case STBI__F_avg_first: - memcpy(cur, raw, filter_bytes); - for (k = filter_bytes; k < nk; ++k) - cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); - break; - } - - raw += nk; - - // expand decoded bits in cur to dest, also adding an extra alpha channel if desired - if (depth < 8) { - stbi_uc scale = (color == 0) ? 
stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range - stbi_uc *in = cur; - stbi_uc *out = dest; - stbi_uc inb = 0; - stbi__uint32 nsmp = x*img_n; - - // expand bits to bytes first - if (depth == 4) { - for (i=0; i < nsmp; ++i) { - if ((i & 1) == 0) inb = *in++; - *out++ = scale * (inb >> 4); - inb <<= 4; + // handle first byte explicitly + for (k = 0; k < filter_bytes; ++k) { + switch (filter) { + case STBI__F_none: + cur[k] = raw[k]; + break; + case STBI__F_sub: + cur[k] = raw[k]; + break; + case STBI__F_up: + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); + break; + case STBI__F_avg: + cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1)); + break; + case STBI__F_paeth: + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0)); + break; + case STBI__F_avg_first: + cur[k] = raw[k]; + break; + case STBI__F_paeth_first: + cur[k] = raw[k]; + break; } - } else if (depth == 2) { - for (i=0; i < nsmp; ++i) { - if ((i & 3) == 0) inb = *in++; - *out++ = scale * (inb >> 6); - inb <<= 2; + } + + if (depth == 8) { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } else if (depth == 16) { + if (img_n != out_n) { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes + 1] = 255; // first pixel bottom byte } - } else { - STBI_ASSERT(depth == 1); - for (i=0; i < nsmp; ++i) { - if ((i & 7) == 0) inb = *in++; - *out++ = scale * (inb >> 7); - inb <<= 1; + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } else { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) { + int nk = (width - 1) * filter_bytes; +#define STBI__CASE(f) \ + case f: \ + for (k = 0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. 
+ case STBI__F_none: + memcpy(cur, raw, nk); + break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]); } + break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } + break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); } + break; + STBI__CASE(STBI__F_paeth) { + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); + } + break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); } + break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0)); } + break; } - } - - // insert alpha=255 values if desired - if (img_n != out_n) - stbi__create_png_alpha_expand8(dest, dest, x, img_n); - } else if (depth == 8) { - if (img_n == out_n) - memcpy(dest, cur, x*img_n); - else - stbi__create_png_alpha_expand8(dest, cur, x, img_n); - } else if (depth == 16) { - // convert the image data from big-endian to platform-native - stbi__uint16 *dest16 = (stbi__uint16*)dest; - stbi__uint32 nsmp = x*img_n; - - if (img_n == out_n) { - for (i = 0; i < nsmp; ++i, ++dest16, cur += 2) - *dest16 = (cur[0] << 8) | cur[1]; - } else { - STBI_ASSERT(img_n+1 == out_n); - if (img_n == 1) { - for (i = 0; i < x; ++i, dest16 += 2, cur += 2) { - dest16[0] = (cur[0] << 8) | cur[1]; - dest16[1] = 0xffff; - } - } else { - STBI_ASSERT(img_n == 3); - for (i = 0; i < x; ++i, dest16 += 4, cur += 6) { - dest16[0] = (cur[0] << 8) | cur[1]; - dest16[1] = (cur[2] << 8) | cur[3]; - dest16[2] = (cur[4] << 8) | cur[5]; - dest16[3] = 0xffff; - } +#undef STBI__CASE + raw += nk; + } else { + STBI_ASSERT(img_n + 1 == out_n); +#define STBI__CASE(f) \ + case f: \ + for (i = x - 1; i >= 1; --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \ + for (k = 0; k < filter_bytes; ++k) + switch (filter) { + STBI__CASE(STBI__F_none) { cur[k] = raw[k]; } + break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k - output_bytes]); } + break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } + break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); } + break; + STBI__CASE(STBI__F_paeth) { + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); + } + break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); } + break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], 0, 0)); } + break; } - } - } - } +#undef STBI__CASE - STBI_FREE(filter_buf); - if (!all_ok) return 0; + // the loop above sets the high byte of the pixels' alpha, but for + // 16 bit png files we also need the low byte set. we'll do that here. + if (depth == 16) { + cur = a->out + stride * j; // start at the beginning of the row again + for (i = 0; i < x; ++i, cur += output_bytes) { + cur[filter_bytes + 1] = 255; + } + } + } + } - return 1; + // we make a separate pass to expand bits to pixels; for performance, + // this could run two scanlines behind the above code, so it won't + // intefere with filtering but will still be in the cache. + if (depth < 8) { + for (j = 0; j < y; ++j) { + stbi_uc * cur = a->out + stride * j; + stbi_uc * in = a->out + stride * j + x * out_n - img_width_bytes; + // unpack 1/2/4-bit into a 8-bit buffer. 
allows us to keep the common 8-bit path optimal at minimal cost for + // 1/2/4-bit png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that + // will be skipped in the later loop + stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + + // note that the final byte might overshoot and write more data than desired. + // we can allocate enough data that this never writes out of memory, but it + // could also overwrite the next scanline. can it overwrite non-empty data + // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. + // so we need to explicitly clamp the final ones + + if (depth == 4) { + for (k = x * img_n; k >= 2; k -= 2, ++in) { + *cur++ = scale * ((*in >> 4)); + *cur++ = scale * ((*in) & 0x0f); + } + if (k > 0) + *cur++ = scale * ((*in >> 4)); + } else if (depth == 2) { + for (k = x * img_n; k >= 4; k -= 4, ++in) { + *cur++ = scale * ((*in >> 6)); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in) & 0x03); + } + if (k > 0) + *cur++ = scale * ((*in >> 6)); + if (k > 1) + *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) + *cur++ = scale * ((*in >> 2) & 0x03); + } else if (depth == 1) { + for (k = x * img_n; k >= 8; k -= 8, ++in) { + *cur++ = scale * ((*in >> 7)); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in) & 0x01); + } + if (k > 0) + *cur++ = scale * ((*in >> 7)); + if (k > 1) + *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) + *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) + *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) + *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) + *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) + *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) { + int q; + // insert alpha = 255 + cur = a->out + stride * j; + if (img_n == 1) { + for (q = x - 1; q >= 0; --q) { + cur[q * 2 + 1] = 255; + cur[q * 2 + 0] = cur[q]; + } + } else { + STBI_ASSERT(img_n == 3); + for (q = x - 1; q >= 0; --q) { + cur[q * 4 + 3] = 255; + cur[q * 4 + 2] = cur[q * 3 + 2]; + cur[q * 4 + 1] = cur[q * 3 + 1]; + cur[q * 4 + 0] = cur[q * 3 + 0]; + } + } + } + } + } else if (depth == 16) { + // force the image data from big-endian to platform-native. + // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. + stbi_uc * cur = a->out; + stbi__uint16 * cur16 = (stbi__uint16 *)cur; + + for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) { + *cur16 = (cur[0] << 8) | cur[1]; + } + } + + return 1; } -static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced) -{ - int bytes = (depth == 16 ? 2 : 1); - int out_bytes = out_n * bytes; - stbi_uc *final; - int p; - if (!interlaced) - return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); +static int stbi__create_png_image(stbi__png * a, stbi_uc * image_data, stbi__uint32 image_data_len, int out_n, int depth, + int color, int interlaced) { + int bytes = (depth == 16 ? 
2 : 1); + int out_bytes = out_n * bytes; + stbi_uc * final; + int p; + if (!interlaced) + return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); - // de-interlacing - final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); - if (!final) return stbi__err("outofmem", "Out of memory"); - for (p=0; p < 7; ++p) { - int xorig[] = { 0,4,0,2,0,1,0 }; - int yorig[] = { 0,0,4,0,2,0,1 }; - int xspc[] = { 8,8,4,4,2,2,1 }; - int yspc[] = { 8,8,8,4,4,2,2 }; - int i,j,x,y; - // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 - x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p]; - y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p]; - if (x && y) { - stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; - if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { - STBI_FREE(final); - return 0; - } - for (j=0; j < y; ++j) { - for (i=0; i < x; ++i) { - int out_y = j*yspc[p]+yorig[p]; - int out_x = i*xspc[p]+xorig[p]; - memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes, - a->out + (j*x+i)*out_bytes, out_bytes); + // de-interlacing + final = (stbi_uc *)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + if (!final) + return stbi__err("outofmem", "Out of memory"); + for (p = 0; p < 7; ++p) { + int xorig[] = {0, 4, 0, 2, 0, 1, 0}; + int yorig[] = {0, 0, 4, 0, 2, 0, 1}; + int xspc[] = {8, 8, 4, 4, 2, 2, 1}; + int yspc[] = {8, 8, 8, 4, 4, 2, 2}; + int i, j, x, y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p]; + if (x && y) { + stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + STBI_FREE(final); + return 0; } - } - STBI_FREE(a->out); - image_data += img_len; - image_data_len -= img_len; - } - } - a->out = final; + for (j = 0; j < y; ++j) { + for (i = 0; i < x; ++i) { + int out_y = j * yspc[p] + yorig[p]; + int out_x = i * xspc[p] + xorig[p]; + memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes, a->out + (j * x + i) * out_bytes, + out_bytes); + } + } + STBI_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; - return 1; + return 1; } -static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n) -{ - stbi__context *s = z->s; - stbi__uint32 i, pixel_count = s->img_x * s->img_y; - stbi_uc *p = z->out; +static int stbi__compute_transparency(stbi__png * z, stbi_uc tc[3], int out_n) { + stbi__context * s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc * p = z->out; - // compute color-based transparency, assuming we've - // already got 255 as the alpha value in the output - STBI_ASSERT(out_n == 2 || out_n == 4); + // compute color-based transparency, assuming we've + // already got 255 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); - if (out_n == 2) { - for (i=0; i < pixel_count; ++i) { - p[1] = (p[0] == tc[0] ? 0 : 255); - p += 2; - } - } else { - for (i=0; i < pixel_count; ++i) { - if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) - p[3] = 0; - p += 4; - } - } - return 1; + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 
0 : 255); + p += 2; + } + } else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; } -static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n) -{ - stbi__context *s = z->s; - stbi__uint32 i, pixel_count = s->img_x * s->img_y; - stbi__uint16 *p = (stbi__uint16*) z->out; +static int stbi__compute_transparency16(stbi__png * z, stbi__uint16 tc[3], int out_n) { + stbi__context * s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi__uint16 * p = (stbi__uint16 *)z->out; - // compute color-based transparency, assuming we've - // already got 65535 as the alpha value in the output - STBI_ASSERT(out_n == 2 || out_n == 4); + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); - if (out_n == 2) { - for (i = 0; i < pixel_count; ++i) { - p[1] = (p[0] == tc[0] ? 0 : 65535); - p += 2; - } - } else { - for (i = 0; i < pixel_count; ++i) { - if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) - p[3] = 0; - p += 4; - } - } - return 1; + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 65535); + p += 2; + } + } else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; } -static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n) -{ - stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; - stbi_uc *p, *temp_out, *orig = a->out; +static int stbi__expand_png_palette(stbi__png * a, stbi_uc * palette, int len, int pal_img_n) { + stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; + stbi_uc *p, *temp_out, *orig = a->out; - p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0); - if (p == NULL) return stbi__err("outofmem", "Out of memory"); + p = (stbi_uc *)stbi__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) + return stbi__err("outofmem", "Out of memory"); - // between here and free(out) below, exitting would leak - temp_out = p; + // between here and free(out) below, exitting would leak + temp_out = p; - if (pal_img_n == 3) { - for (i=0; i < pixel_count; ++i) { - int n = orig[i]*4; - p[0] = palette[n ]; - p[1] = palette[n+1]; - p[2] = palette[n+2]; - p += 3; - } - } else { - for (i=0; i < pixel_count; ++i) { - int n = orig[i]*4; - p[0] = palette[n ]; - p[1] = palette[n+1]; - p[2] = palette[n+2]; - p[3] = palette[n+3]; - p += 4; - } - } - STBI_FREE(a->out); - a->out = temp_out; + if (pal_img_n == 3) { + for (i = 0; i < pixel_count; ++i) { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p += 3; + } + } else { + for (i = 0; i < pixel_count; ++i) { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p[3] = palette[n + 3]; + p += 4; + } + } + STBI_FREE(a->out); + a->out = temp_out; - STBI_NOTUSED(len); + STBI_NOTUSED(len); - return 1; + return 1; } static int stbi__unpremultiply_on_load_global = 0; static int stbi__de_iphone_flag_global = 0; -STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) -{ - stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply; +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) { + stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply; } -STBIDEF void stbi_convert_iphone_png_to_rgb(int 
flag_true_if_should_convert) -{ - stbi__de_iphone_flag_global = flag_true_if_should_convert; +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) { + stbi__de_iphone_flag_global = flag_true_if_should_convert; } #ifndef STBI_THREAD_LOCAL -#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global -#define stbi__de_iphone_flag stbi__de_iphone_flag_global +#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global +#define stbi__de_iphone_flag stbi__de_iphone_flag_global #else static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set; static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set; -STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) -{ - stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply; - stbi__unpremultiply_on_load_set = 1; +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) { + stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply; + stbi__unpremultiply_on_load_set = 1; } -STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) -{ - stbi__de_iphone_flag_local = flag_true_if_should_convert; - stbi__de_iphone_flag_set = 1; +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) { + stbi__de_iphone_flag_local = flag_true_if_should_convert; + stbi__de_iphone_flag_set = 1; } -#define stbi__unpremultiply_on_load (stbi__unpremultiply_on_load_set \ - ? stbi__unpremultiply_on_load_local \ - : stbi__unpremultiply_on_load_global) -#define stbi__de_iphone_flag (stbi__de_iphone_flag_set \ - ? stbi__de_iphone_flag_local \ - : stbi__de_iphone_flag_global) +#define stbi__unpremultiply_on_load \ + (stbi__unpremultiply_on_load_set ? stbi__unpremultiply_on_load_local : stbi__unpremultiply_on_load_global) +#define stbi__de_iphone_flag (stbi__de_iphone_flag_set ? 
stbi__de_iphone_flag_local : stbi__de_iphone_flag_global) #endif // STBI_THREAD_LOCAL -static void stbi__de_iphone(stbi__png *z) -{ - stbi__context *s = z->s; - stbi__uint32 i, pixel_count = s->img_x * s->img_y; - stbi_uc *p = z->out; +static void stbi__de_iphone(stbi__png * z) { + stbi__context * s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc * p = z->out; - if (s->img_out_n == 3) { // convert bgr to rgb - for (i=0; i < pixel_count; ++i) { - stbi_uc t = p[0]; - p[0] = p[2]; - p[2] = t; - p += 3; - } - } else { - STBI_ASSERT(s->img_out_n == 4); - if (stbi__unpremultiply_on_load) { - // convert bgr to rgb and unpremultiply - for (i=0; i < pixel_count; ++i) { - stbi_uc a = p[3]; - stbi_uc t = p[0]; - if (a) { - stbi_uc half = a / 2; - p[0] = (p[2] * 255 + half) / a; - p[1] = (p[1] * 255 + half) / a; - p[2] = ( t * 255 + half) / a; - } else { - p[0] = p[2]; - p[2] = t; - } - p += 4; - } - } else { - // convert bgr to rgb - for (i=0; i < pixel_count; ++i) { + if (s->img_out_n == 3) { // convert bgr to rgb + for (i = 0; i < pixel_count; ++i) { stbi_uc t = p[0]; p[0] = p[2]; p[2] = t; - p += 4; - } - } - } + p += 3; + } + } else { + STBI_ASSERT(s->img_out_n == 4); + if (stbi__unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i = 0; i < pixel_count; ++i) { + stbi_uc a = p[3]; + stbi_uc t = p[0]; + if (a) { + stbi_uc half = a / 2; + p[0] = (p[2] * 255 + half) / a; + p[1] = (p[1] * 255 + half) / a; + p[2] = (t * 255 + half) / a; + } else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } else { + // convert bgr to rgb + for (i = 0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } } -#define STBI__PNG_TYPE(a,b,c,d) (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) +#define STBI__PNG_TYPE(a, b, c, d) (((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) + (unsigned)(d)) -static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) -{ - stbi_uc palette[1024], pal_img_n=0; - stbi_uc has_trans=0, tc[3]={0}; - stbi__uint16 tc16[3]; - stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0; - int first=1,k,interlace=0, color=0, is_iphone=0; - stbi__context *s = z->s; +static int stbi__parse_png_file(stbi__png * z, int scan, int req_comp) { + stbi_uc palette[1024], pal_img_n = 0; + stbi_uc has_trans = 0, tc[3] = {0}; + stbi__uint16 tc16[3]; + stbi__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0; + int first = 1, k, interlace = 0, color = 0, is_iphone = 0; + stbi__context * s = z->s; - z->expanded = NULL; - z->idata = NULL; - z->out = NULL; + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; - if (!stbi__check_png_header(s)) return 0; + if (!stbi__check_png_header(s)) + return 0; - if (scan == STBI__SCAN_type) return 1; + if (scan == STBI__SCAN_type) + return 1; - for (;;) { - stbi__pngchunk c = stbi__get_chunk_header(s); - switch (c.type) { - case STBI__PNG_TYPE('C','g','B','I'): + for (;;) { + stbi__pngchunk c = stbi__get_chunk_header(s); + switch (c.type) { + case STBI__PNG_TYPE('C', 'g', 'B', 'I'): is_iphone = 1; stbi__skip(s, c.length); break; - case STBI__PNG_TYPE('I','H','D','R'): { - int comp,filter; - if (!first) return stbi__err("multiple IHDR","Corrupt PNG"); + case STBI__PNG_TYPE('I', 'H', 'D', 'R'): { + int comp, filter; + if (!first) + return stbi__err("multiple IHDR", "Corrupt PNG"); first = 0; - if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG"); + if (c.length != 13) + return stbi__err("bad IHDR len", "Corrupt PNG"); 
s->img_x = stbi__get32be(s); s->img_y = stbi__get32be(s); - if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); - if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); - z->depth = stbi__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only"); - color = stbi__get8(s); if (color > 6) return stbi__err("bad ctype","Corrupt PNG"); - if (color == 3 && z->depth == 16) return stbi__err("bad ctype","Corrupt PNG"); - if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG"); - comp = stbi__get8(s); if (comp) return stbi__err("bad comp method","Corrupt PNG"); - filter= stbi__get8(s); if (filter) return stbi__err("bad filter method","Corrupt PNG"); - interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG"); - if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG"); + if (s->img_y > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + z->depth = stbi__get8(s); + if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) + return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only"); + color = stbi__get8(s); + if (color > 6) + return stbi__err("bad ctype", "Corrupt PNG"); + if (color == 3 && z->depth == 16) + return stbi__err("bad ctype", "Corrupt PNG"); + if (color == 3) + pal_img_n = 3; + else if (color & 1) + return stbi__err("bad ctype", "Corrupt PNG"); + comp = stbi__get8(s); + if (comp) + return stbi__err("bad comp method", "Corrupt PNG"); + filter = stbi__get8(s); + if (filter) + return stbi__err("bad filter method", "Corrupt PNG"); + interlace = stbi__get8(s); + if (interlace > 1) + return stbi__err("bad interlace method", "Corrupt PNG"); + if (!s->img_x || !s->img_y) + return stbi__err("0-pixel image", "Corrupt PNG"); if (!pal_img_n) { - s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); - if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode"); + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) + return stbi__err("too large", "Image too large to decode"); } else { - // if paletted, then pal_n is our final components, and - // img_n is # components to decompress/filter. - s->img_n = 1; - if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG"); + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. 
+ s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) + return stbi__err("too large", "Corrupt PNG"); } // even with SCAN_header, have to scan to see if we have a tRNS break; - } + } - case STBI__PNG_TYPE('P','L','T','E'): { - if (first) return stbi__err("first not IHDR", "Corrupt PNG"); - if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG"); + case STBI__PNG_TYPE('P', 'L', 'T', 'E'): { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256 * 3) + return stbi__err("invalid PLTE", "Corrupt PNG"); pal_len = c.length / 3; - if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG"); - for (i=0; i < pal_len; ++i) { - palette[i*4+0] = stbi__get8(s); - palette[i*4+1] = stbi__get8(s); - palette[i*4+2] = stbi__get8(s); - palette[i*4+3] = 255; + if (pal_len * 3 != c.length) + return stbi__err("invalid PLTE", "Corrupt PNG"); + for (i = 0; i < pal_len; ++i) { + palette[i * 4 + 0] = stbi__get8(s); + palette[i * 4 + 1] = stbi__get8(s); + palette[i * 4 + 2] = stbi__get8(s); + palette[i * 4 + 3] = 255; } break; - } + } - case STBI__PNG_TYPE('t','R','N','S'): { - if (first) return stbi__err("first not IHDR", "Corrupt PNG"); - if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG"); + case STBI__PNG_TYPE('t', 'R', 'N', 'S'): { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) + return stbi__err("tRNS after IDAT", "Corrupt PNG"); if (pal_img_n) { - if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; } - if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG"); - if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG"); - pal_img_n = 4; - for (i=0; i < c.length; ++i) - palette[i*4+3] = stbi__get8(s); + if (scan == STBI__SCAN_header) { + s->img_n = 4; + return 1; + } + if (pal_len == 0) + return stbi__err("tRNS before PLTE", "Corrupt PNG"); + if (c.length > pal_len) + return stbi__err("bad tRNS len", "Corrupt PNG"); + pal_img_n = 4; + for (i = 0; i < c.length; ++i) + palette[i * 4 + 3] = stbi__get8(s); } else { - if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG"); - if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG"); - has_trans = 1; - // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. - if (scan == STBI__SCAN_header) { ++s->img_n; return 1; } - if (z->depth == 16) { - for (k = 0; k < s->img_n && k < 3; ++k) // extra loop test to suppress false GCC warning - tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is - } else { - for (k = 0; k < s->img_n && k < 3; ++k) - tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger - } + if (!(s->img_n & 1)) + return stbi__err("tRNS with alpha", "Corrupt PNG"); + if (c.length != (stbi__uint32)s->img_n * 2) + return stbi__err("bad tRNS len", "Corrupt PNG"); + has_trans = 1; + // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. 
+ if (scan == STBI__SCAN_header) { + ++s->img_n; + return 1; + } + if (z->depth == 16) { + for (k = 0; k < s->img_n; ++k) + tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + } else { + for (k = 0; k < s->img_n; ++k) + tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * + stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + } } break; - } + } - case STBI__PNG_TYPE('I','D','A','T'): { - if (first) return stbi__err("first not IHDR", "Corrupt PNG"); - if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG"); + case STBI__PNG_TYPE('I', 'D', 'A', 'T'): { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) + return stbi__err("no PLTE", "Corrupt PNG"); if (scan == STBI__SCAN_header) { - // header scan definitely stops at first IDAT - if (pal_img_n) - s->img_n = pal_img_n; - return 1; + // header scan definitely stops at first IDAT + if (pal_img_n) + s->img_n = pal_img_n; + return 1; } - if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes"); - if ((int)(ioff + c.length) < (int)ioff) return 0; + if (c.length > (1u << 30)) + return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes"); + if ((int)(ioff + c.length) < (int)ioff) + return 0; if (ioff + c.length > idata_limit) { - stbi__uint32 idata_limit_old = idata_limit; - stbi_uc *p; - if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096; - while (ioff + c.length > idata_limit) - idata_limit *= 2; - STBI_NOTUSED(idata_limit_old); - p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory"); - z->idata = p; + stbi__uint32 idata_limit_old = idata_limit; + stbi_uc * p; + if (idata_limit == 0) + idata_limit = c.length > 4096 ? 
c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (stbi_uc *)STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); + if (p == NULL) + return stbi__err("outofmem", "Out of memory"); + z->idata = p; } - if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG"); + if (!stbi__getn(s, z->idata + ioff, c.length)) + return stbi__err("outofdata", "Corrupt PNG"); ioff += c.length; break; - } + } - case STBI__PNG_TYPE('I','E','N','D'): { + case STBI__PNG_TYPE('I', 'E', 'N', 'D'): { stbi__uint32 raw_len, bpl; - if (first) return stbi__err("first not IHDR", "Corrupt PNG"); - if (scan != STBI__SCAN_load) return 1; - if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG"); + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) + return 1; + if (z->idata == NULL) + return stbi__err("no IDAT", "Corrupt PNG"); // initial guess for decoded data size to avoid unnecessary reallocs bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; - z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone); - if (z->expanded == NULL) return 0; // zlib should set error - STBI_FREE(z->idata); z->idata = NULL; - if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans) - s->img_out_n = s->img_n+1; + z->expanded = (stbi_uc *)stbi_zlib_decode_malloc_guesssize_headerflag((char *)z->idata, ioff, raw_len, + (int *)&raw_len, !is_iphone); + if (z->expanded == NULL) + return 0; // zlib should set error + STBI_FREE(z->idata); + z->idata = NULL; + if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n + 1; else - s->img_out_n = s->img_n; - if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0; + s->img_out_n = s->img_n; + if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) + return 0; if (has_trans) { - if (z->depth == 16) { - if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0; - } else { - if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0; - } + if (z->depth == 16) { + if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) + return 0; + } else { + if (!stbi__compute_transparency(z, tc, s->img_out_n)) + return 0; + } } if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) - stbi__de_iphone(z); + stbi__de_iphone(z); if (pal_img_n) { - // pal_img_n == 3 or 4 - s->img_n = pal_img_n; // record the actual colors we had - s->img_out_n = pal_img_n; - if (req_comp >= 3) s->img_out_n = req_comp; - if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) - return 0; + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) + s->img_out_n = req_comp; + if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; } else if (has_trans) { - // non-paletted image with tRNS -> source image has (constant) alpha - ++s->img_n; + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; } - STBI_FREE(z->expanded); z->expanded = NULL; + STBI_FREE(z->expanded); + z->expanded = NULL; // end of PNG chunk, read and skip CRC stbi__get32be(s); return 1; - } + } - default: + default: // if critical, fail - if (first) 
return stbi__err("first not IHDR", "Corrupt PNG"); + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); if ((c.type & (1 << 29)) == 0) { - #ifndef STBI_NO_FAILURE_STRINGS - // not threadsafe - static char invalid_chunk[] = "XXXX PNG chunk not known"; - invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); - invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); - invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); - invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); - #endif - return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); +#ifndef STBI_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); + invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); + invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); + invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); +#endif + return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); } stbi__skip(s, c.length); break; - } - // end of PNG chunk, read and skip CRC - stbi__get32be(s); - } + } + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + } } -static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri) -{ - void *result=NULL; - if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); - if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { - if (p->depth <= 8) - ri->bits_per_channel = 8; - else if (p->depth == 16) - ri->bits_per_channel = 16; - else - return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); - result = p->out; - p->out = NULL; - if (req_comp && req_comp != p->s->img_out_n) { - if (ri->bits_per_channel == 8) - result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); - else - result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); - p->s->img_out_n = req_comp; - if (result == NULL) return result; - } - *x = p->s->img_x; - *y = p->s->img_y; - if (n) *n = p->s->img_n; - } - STBI_FREE(p->out); p->out = NULL; - STBI_FREE(p->expanded); p->expanded = NULL; - STBI_FREE(p->idata); p->idata = NULL; +static void * stbi__do_png(stbi__png * p, int * x, int * y, int * n, int req_comp, stbi__result_info * ri) { + void * result = NULL; + if (req_comp < 0 || req_comp > 4) + return stbi__errpuc("bad req_comp", "Internal error"); + if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { + if (p->depth <= 8) + ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; + else + return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = stbi__convert_format((unsigned char *)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + result = stbi__convert_format16((stbi__uint16 *)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + p->s->img_out_n = req_comp; + if (result == NULL) + return result; + } + *x = p->s->img_x; + *y = p->s->img_y; + if (n) + *n = p->s->img_n; + } + STBI_FREE(p->out); + p->out = NULL; + STBI_FREE(p->expanded); + p->expanded = NULL; + STBI_FREE(p->idata); + p->idata = NULL; - return result; + return result; } -static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) -{ - stbi__png p; - p.s = s; - return stbi__do_png(&p, 
x,y,comp,req_comp, ri); +static void * stbi__png_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + stbi__png p; + p.s = s; + return stbi__do_png(&p, x, y, comp, req_comp, ri); } -static int stbi__png_test(stbi__context *s) -{ - int r; - r = stbi__check_png_header(s); - stbi__rewind(s); - return r; +static int stbi__png_test(stbi__context * s) { + int r; + r = stbi__check_png_header(s); + stbi__rewind(s); + return r; } -static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp) -{ - if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) { - stbi__rewind( p->s ); - return 0; - } - if (x) *x = p->s->img_x; - if (y) *y = p->s->img_y; - if (comp) *comp = p->s->img_n; - return 1; +static int stbi__png_info_raw(stbi__png * p, int * x, int * y, int * comp) { + if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) { + stbi__rewind(p->s); + return 0; + } + if (x) + *x = p->s->img_x; + if (y) + *y = p->s->img_y; + if (comp) + *comp = p->s->img_n; + return 1; } -static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp) -{ - stbi__png p; - p.s = s; - return stbi__png_info_raw(&p, x, y, comp); +static int stbi__png_info(stbi__context * s, int * x, int * y, int * comp) { + stbi__png p; + p.s = s; + return stbi__png_info_raw(&p, x, y, comp); } -static int stbi__png_is16(stbi__context *s) -{ - stbi__png p; - p.s = s; - if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) - return 0; - if (p.depth != 16) { - stbi__rewind(p.s); - return 0; - } - return 1; +static int stbi__png_is16(stbi__context * s) { + stbi__png p; + p.s = s; + if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) + return 0; + if (p.depth != 16) { + stbi__rewind(p.s); + return 0; + } + return 1; } #endif // Microsoft/Windows BMP image #ifndef STBI_NO_BMP -static int stbi__bmp_test_raw(stbi__context *s) -{ - int r; - int sz; - if (stbi__get8(s) != 'B') return 0; - if (stbi__get8(s) != 'M') return 0; - stbi__get32le(s); // discard filesize - stbi__get16le(s); // discard reserved - stbi__get16le(s); // discard reserved - stbi__get32le(s); // discard data offset - sz = stbi__get32le(s); - r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124); - return r; +static int stbi__bmp_test_raw(stbi__context * s) { + int r; + int sz; + if (stbi__get8(s) != 'B') + return 0; + if (stbi__get8(s) != 'M') + return 0; + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + stbi__get32le(s); // discard data offset + sz = stbi__get32le(s); + r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124); + return r; } -static int stbi__bmp_test(stbi__context *s) -{ - int r = stbi__bmp_test_raw(s); - stbi__rewind(s); - return r; +static int stbi__bmp_test(stbi__context * s) { + int r = stbi__bmp_test_raw(s); + stbi__rewind(s); + return r; } - // returns 0..31 for the highest set bit -static int stbi__high_bit(unsigned int z) -{ - int n=0; - if (z == 0) return -1; - if (z >= 0x10000) { n += 16; z >>= 16; } - if (z >= 0x00100) { n += 8; z >>= 8; } - if (z >= 0x00010) { n += 4; z >>= 4; } - if (z >= 0x00004) { n += 2; z >>= 2; } - if (z >= 0x00002) { n += 1;/* >>= 1;*/ } - return n; +static int stbi__high_bit(unsigned int z) { + int n = 0; + if (z == 0) + return -1; + if (z >= 0x10000) { + n += 16; + z >>= 16; + } + if (z >= 0x00100) { + n += 8; + z >>= 8; + } + if (z >= 0x00010) { + n += 4; + z >>= 4; + } + if (z >= 0x00004) { + n += 2; + z >>= 2; + } + if (z >= 0x00002) { + n += 1; /* >>= 1;*/ + } + return n; } -static int 
stbi__bitcount(unsigned int a) -{ - a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 - a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 - a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits - a = (a + (a >> 8)); // max 16 per 8 bits - a = (a + (a >> 16)); // max 32 per 8 bits - return a & 0xff; +static int stbi__bitcount(unsigned int a) { + a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 + a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 + a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits + a = (a + (a >> 8)); // max 16 per 8 bits + a = (a + (a >> 16)); // max 32 per 8 bits + return a & 0xff; } // extract an arbitrarily-aligned N-bit value (N=bits) // from v, and then make it 8-bits long and fractionally // extend it to full full range. -static int stbi__shiftsigned(unsigned int v, int shift, int bits) -{ - static unsigned int mul_table[9] = { - 0, - 0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/, - 0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/, - }; - static unsigned int shift_table[9] = { - 0, 0,0,1,0,2,4,6,0, - }; - if (shift < 0) - v <<= -shift; - else - v >>= shift; - STBI_ASSERT(v < 256); - v >>= (8-bits); - STBI_ASSERT(bits >= 0 && bits <= 8); - return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits]; +static int stbi__shiftsigned(unsigned int v, int shift, int bits) { + static unsigned int mul_table[9] = { + 0, + 0xff /*0b11111111*/, + 0x55 /*0b01010101*/, + 0x49 /*0b01001001*/, + 0x11 /*0b00010001*/, + 0x21 /*0b00100001*/, + 0x41 /*0b01000001*/, + 0x81 /*0b10000001*/, + 0x01 /*0b00000001*/, + }; + static unsigned int shift_table[9] = { + 0, 0, 0, 1, 0, 2, 4, 6, 0, + }; + if (shift < 0) + v <<= -shift; + else + v >>= shift; + STBI_ASSERT(v < 256); + v >>= (8 - bits); + STBI_ASSERT(bits >= 0 && bits <= 8); + return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits]; } -typedef struct -{ - int bpp, offset, hsz; - unsigned int mr,mg,mb,ma, all_a; - int extra_read; +typedef struct { + int bpp, offset, hsz; + unsigned int mr, mg, mb, ma, all_a; + int extra_read; } stbi__bmp_data; -static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress) -{ - // BI_BITFIELDS specifies masks explicitly, don't override - if (compress == 3) - return 1; +static int stbi__bmp_set_mask_defaults(stbi__bmp_data * info, int compress) { + // BI_BITFIELDS specifies masks explicitly, don't override + if (compress == 3) + return 1; - if (compress == 0) { - if (info->bpp == 16) { - info->mr = 31u << 10; - info->mg = 31u << 5; - info->mb = 31u << 0; - } else if (info->bpp == 32) { - info->mr = 0xffu << 16; - info->mg = 0xffu << 8; - info->mb = 0xffu << 0; - info->ma = 0xffu << 24; - info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 - } else { - // otherwise, use defaults, which is all-0 - info->mr = info->mg = info->mb = info->ma = 0; - } - return 1; - } - return 0; // error + if (compress == 0) { + if (info->bpp == 16) { + info->mr = 31u << 10; + info->mg = 31u << 5; + info->mb = 31u << 0; + } else if (info->bpp == 32) { + info->mr = 0xffu << 16; + info->mg = 0xffu << 8; + info->mb = 0xffu << 0; + info->ma = 0xffu << 24; + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + } else { + // otherwise, use defaults, which is all-0 + info->mr = info->mg = info->mb = info->ma = 0; + } + return 1; + } + return 0; // error } -static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) 
-{ - int hsz; - if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP"); - stbi__get32le(s); // discard filesize - stbi__get16le(s); // discard reserved - stbi__get16le(s); // discard reserved - info->offset = stbi__get32le(s); - info->hsz = hsz = stbi__get32le(s); - info->mr = info->mg = info->mb = info->ma = 0; - info->extra_read = 14; +static void * stbi__bmp_parse_header(stbi__context * s, stbi__bmp_data * info) { + int hsz; + if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') + return stbi__errpuc("not BMP", "Corrupt BMP"); + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + info->offset = stbi__get32le(s); + info->hsz = hsz = stbi__get32le(s); + info->mr = info->mg = info->mb = info->ma = 0; + info->extra_read = 14; - if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP"); + if (info->offset < 0) + return stbi__errpuc("bad BMP", "bad BMP"); - if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); - if (hsz == 12) { - s->img_x = stbi__get16le(s); - s->img_y = stbi__get16le(s); - } else { - s->img_x = stbi__get32le(s); - s->img_y = stbi__get32le(s); - } - if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP"); - info->bpp = stbi__get16le(s); - if (hsz != 12) { - int compress = stbi__get32le(s); - if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); - if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes - if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel - stbi__get32le(s); // discard sizeof - stbi__get32le(s); // discard hres - stbi__get32le(s); // discard vres - stbi__get32le(s); // discard colorsused - stbi__get32le(s); // discard max important - if (hsz == 40 || hsz == 56) { - if (hsz == 56) { - stbi__get32le(s); - stbi__get32le(s); - stbi__get32le(s); - stbi__get32le(s); - } - if (info->bpp == 16 || info->bpp == 32) { - if (compress == 0) { - stbi__bmp_set_mask_defaults(info, compress); - } else if (compress == 3) { - info->mr = stbi__get32le(s); - info->mg = stbi__get32le(s); - info->mb = stbi__get32le(s); - info->extra_read += 12; - // not documented, but generated by photoshop and handled by mspaint - if (info->mr == info->mg && info->mg == info->mb) { - // ?!?!? 
- return stbi__errpuc("bad BMP", "bad BMP"); - } - } else - return stbi__errpuc("bad BMP", "bad BMP"); - } - } else { - // V4/V5 header - int i; - if (hsz != 108 && hsz != 124) - return stbi__errpuc("bad BMP", "bad BMP"); - info->mr = stbi__get32le(s); - info->mg = stbi__get32le(s); - info->mb = stbi__get32le(s); - info->ma = stbi__get32le(s); - if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs - stbi__bmp_set_mask_defaults(info, compress); - stbi__get32le(s); // discard color space - for (i=0; i < 12; ++i) - stbi__get32le(s); // discard color space parameters - if (hsz == 124) { - stbi__get32le(s); // discard rendering intent - stbi__get32le(s); // discard offset of profile data - stbi__get32le(s); // discard size of profile data - stbi__get32le(s); // discard reserved - } - } - } - return (void *) 1; + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) + return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); + if (hsz == 12) { + s->img_x = stbi__get16le(s); + s->img_y = stbi__get16le(s); + } else { + s->img_x = stbi__get32le(s); + s->img_y = stbi__get32le(s); + } + if (stbi__get16le(s) != 1) + return stbi__errpuc("bad BMP", "bad BMP"); + info->bpp = stbi__get16le(s); + if (hsz != 12) { + int compress = stbi__get32le(s); + if (compress == 1 || compress == 2) + return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); + if (compress >= 4) + return stbi__errpuc("BMP JPEG/PNG", + "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes + if (compress == 3 && info->bpp != 16 && info->bpp != 32) + return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel + stbi__get32le(s); // discard sizeof + stbi__get32le(s); // discard hres + stbi__get32le(s); // discard vres + stbi__get32le(s); // discard colorsused + stbi__get32le(s); // discard max important + if (hsz == 40 || hsz == 56) { + if (hsz == 56) { + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + } + if (info->bpp == 16 || info->bpp == 32) { + if (compress == 0) { + stbi__bmp_set_mask_defaults(info, compress); + } else if (compress == 3) { + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->extra_read += 12; + // not documented, but generated by photoshop and handled by mspaint + if (info->mr == info->mg && info->mg == info->mb) { + // ?!?!? 
+ return stbi__errpuc("bad BMP", "bad BMP"); + } + } else + return stbi__errpuc("bad BMP", "bad BMP"); + } + } else { + // V4/V5 header + int i; + if (hsz != 108 && hsz != 124) + return stbi__errpuc("bad BMP", "bad BMP"); + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->ma = stbi__get32le(s); + if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs + stbi__bmp_set_mask_defaults(info, compress); + stbi__get32le(s); // discard color space + for (i = 0; i < 12; ++i) + stbi__get32le(s); // discard color space parameters + if (hsz == 124) { + stbi__get32le(s); // discard rendering intent + stbi__get32le(s); // discard offset of profile data + stbi__get32le(s); // discard size of profile data + stbi__get32le(s); // discard reserved + } + } + } + return (void *)1; } +static void * stbi__bmp_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + stbi_uc * out; + unsigned int mr = 0, mg = 0, mb = 0, ma = 0, all_a; + stbi_uc pal[256][4]; + int psize = 0, i, j, width; + int flip_vertically, pad, target; + stbi__bmp_data info; + STBI_NOTUSED(ri); -static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) -{ - stbi_uc *out; - unsigned int mr=0,mg=0,mb=0,ma=0, all_a; - stbi_uc pal[256][4]; - int psize=0,i,j,width; - int flip_vertically, pad, target; - stbi__bmp_data info; - STBI_NOTUSED(ri); + info.all_a = 255; + if (stbi__bmp_parse_header(s, &info) == NULL) + return NULL; // error code already set - info.all_a = 255; - if (stbi__bmp_parse_header(s, &info) == NULL) - return NULL; // error code already set + flip_vertically = ((int)s->img_y) > 0; + s->img_y = abs((int)s->img_y); - flip_vertically = ((int) s->img_y) > 0; - s->img_y = abs((int) s->img_y); + if (s->img_y > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); - if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); - if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + mr = info.mr; + mg = info.mg; + mb = info.mb; + ma = info.ma; + all_a = info.all_a; - mr = info.mr; - mg = info.mg; - mb = info.mb; - ma = info.ma; - all_a = info.all_a; + if (info.hsz == 12) { + if (info.bpp < 24) + psize = (info.offset - info.extra_read - 24) / 3; + } else { + if (info.bpp < 16) + psize = (info.offset - info.extra_read - info.hsz) >> 2; + } + if (psize == 0) { + // accept some number of extra bytes after the header, but if the offset points either to before + // the header ends or implies a large amount of extra data, reject the file as malformed + int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original); + int header_limit = 1024; // max we actually read is below 256 bytes currently. + int extra_data_limit = 256 * 4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size. + if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) { + return stbi__errpuc("bad header", "Corrupt BMP"); + } + // we established that bytes_read_so_far is positive and sensible. + // the first half of this test rejects offsets that are either too small positives, or + // negative, and guarantees that info.offset >= bytes_read_so_far > 0. 
this in turn + // ensures the number computed in the second half of the test can't overflow. + if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) { + return stbi__errpuc("bad offset", "Corrupt BMP"); + } else { + stbi__skip(s, info.offset - bytes_read_so_far); + } + } - if (info.hsz == 12) { - if (info.bpp < 24) - psize = (info.offset - info.extra_read - 24) / 3; - } else { - if (info.bpp < 16) - psize = (info.offset - info.extra_read - info.hsz) >> 2; - } - if (psize == 0) { - // accept some number of extra bytes after the header, but if the offset points either to before - // the header ends or implies a large amount of extra data, reject the file as malformed - int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original); - int header_limit = 1024; // max we actually read is below 256 bytes currently. - int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size. - if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) { - return stbi__errpuc("bad header", "Corrupt BMP"); - } - // we established that bytes_read_so_far is positive and sensible. - // the first half of this test rejects offsets that are either too small positives, or - // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn - // ensures the number computed in the second half of the test can't overflow. - if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) { - return stbi__errpuc("bad offset", "Corrupt BMP"); - } else { - stbi__skip(s, info.offset - bytes_read_so_far); - } - } + if (info.bpp == 24 && ma == 0xff000000) + s->img_n = 3; + else + s->img_n = ma ? 4 : 3; + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 + target = req_comp; + else + target = s->img_n; // if they want monochrome, we'll post-convert - if (info.bpp == 24 && ma == 0xff000000) - s->img_n = 3; - else - s->img_n = ma ? 4 : 3; - if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 - target = req_comp; - else - target = s->img_n; // if they want monochrome, we'll post-convert + // sanity-check size + if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) + return stbi__errpuc("too large", "Corrupt BMP"); - // sanity-check size - if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) - return stbi__errpuc("too large", "Corrupt BMP"); - - out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0); - if (!out) return stbi__errpuc("outofmem", "Out of memory"); - if (info.bpp < 16) { - int z=0; - if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); } - for (i=0; i < psize; ++i) { - pal[i][2] = stbi__get8(s); - pal[i][1] = stbi__get8(s); - pal[i][0] = stbi__get8(s); - if (info.hsz != 12) stbi__get8(s); - pal[i][3] = 255; - } - stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 
3 : 4)); - if (info.bpp == 1) width = (s->img_x + 7) >> 3; - else if (info.bpp == 4) width = (s->img_x + 1) >> 1; - else if (info.bpp == 8) width = s->img_x; - else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); } - pad = (-width)&3; - if (info.bpp == 1) { - for (j=0; j < (int) s->img_y; ++j) { - int bit_offset = 7, v = stbi__get8(s); - for (i=0; i < (int) s->img_x; ++i) { - int color = (v>>bit_offset)&0x1; - out[z++] = pal[color][0]; - out[z++] = pal[color][1]; - out[z++] = pal[color][2]; - if (target == 4) out[z++] = 255; - if (i+1 == (int) s->img_x) break; - if((--bit_offset) < 0) { - bit_offset = 7; - v = stbi__get8(s); - } + out = (stbi_uc *)stbi__malloc_mad3(target, s->img_x, s->img_y, 0); + if (!out) + return stbi__errpuc("outofmem", "Out of memory"); + if (info.bpp < 16) { + int z = 0; + if (psize == 0 || psize > 256) { + STBI_FREE(out); + return stbi__errpuc("invalid", "Corrupt BMP"); + } + for (i = 0; i < psize; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + if (info.hsz != 12) + stbi__get8(s); + pal[i][3] = 255; + } + stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4)); + if (info.bpp == 1) + width = (s->img_x + 7) >> 3; + else if (info.bpp == 4) + width = (s->img_x + 1) >> 1; + else if (info.bpp == 8) + width = s->img_x; + else { + STBI_FREE(out); + return stbi__errpuc("bad bpp", "Corrupt BMP"); + } + pad = (-width) & 3; + if (info.bpp == 1) { + for (j = 0; j < (int)s->img_y; ++j) { + int bit_offset = 7, v = stbi__get8(s); + for (i = 0; i < (int)s->img_x; ++i) { + int color = (v >> bit_offset) & 0x1; + out[z++] = pal[color][0]; + out[z++] = pal[color][1]; + out[z++] = pal[color][2]; + if (target == 4) + out[z++] = 255; + if (i + 1 == (int)s->img_x) + break; + if ((--bit_offset) < 0) { + bit_offset = 7; + v = stbi__get8(s); + } + } + stbi__skip(s, pad); + } + } else { + for (j = 0; j < (int)s->img_y; ++j) { + for (i = 0; i < (int)s->img_x; i += 2) { + int v = stbi__get8(s), v2 = 0; + if (info.bpp == 4) { + v2 = v & 15; + v >>= 4; + } + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) + out[z++] = 255; + if (i + 1 == (int)s->img_x) + break; + v = (info.bpp == 8) ? 
stbi__get8(s) : v2; + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) + out[z++] = 255; + } + stbi__skip(s, pad); + } + } + } else { + int rshift = 0, gshift = 0, bshift = 0, ashift = 0, rcount = 0, gcount = 0, bcount = 0, acount = 0; + int z = 0; + int easy = 0; + stbi__skip(s, info.offset - info.extra_read - info.hsz); + if (info.bpp == 24) + width = 3 * s->img_x; + else if (info.bpp == 16) + width = 2 * s->img_x; + else /* bpp = 32 and pad = 0 */ + width = 0; + pad = (-width) & 3; + if (info.bpp == 24) { + easy = 1; + } else if (info.bpp == 32) { + if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) + easy = 2; + } + if (!easy) { + if (!mr || !mg || !mb) { + STBI_FREE(out); + return stbi__errpuc("bad masks", "Corrupt BMP"); + } + // right shift amt to put high bit in position #7 + rshift = stbi__high_bit(mr) - 7; + rcount = stbi__bitcount(mr); + gshift = stbi__high_bit(mg) - 7; + gcount = stbi__bitcount(mg); + bshift = stbi__high_bit(mb) - 7; + bcount = stbi__bitcount(mb); + ashift = stbi__high_bit(ma) - 7; + acount = stbi__bitcount(ma); + if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { + STBI_FREE(out); + return stbi__errpuc("bad masks", "Corrupt BMP"); + } + } + for (j = 0; j < (int)s->img_y; ++j) { + if (easy) { + for (i = 0; i < (int)s->img_x; ++i) { + unsigned char a; + out[z + 2] = stbi__get8(s); + out[z + 1] = stbi__get8(s); + out[z + 0] = stbi__get8(s); + z += 3; + a = (easy == 2 ? stbi__get8(s) : 255); + all_a |= a; + if (target == 4) + out[z++] = a; + } + } else { + int bpp = info.bpp; + for (i = 0; i < (int)s->img_x; ++i) { + stbi__uint32 v = (bpp == 16 ? (stbi__uint32)stbi__get16le(s) : stbi__get32le(s)); + unsigned int a; + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); + a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255); + all_a |= a; + if (target == 4) + out[z++] = STBI__BYTECAST(a); + } } stbi__skip(s, pad); - } - } else { - for (j=0; j < (int) s->img_y; ++j) { - for (i=0; i < (int) s->img_x; i += 2) { - int v=stbi__get8(s),v2=0; - if (info.bpp == 4) { - v2 = v & 15; - v >>= 4; - } - out[z++] = pal[v][0]; - out[z++] = pal[v][1]; - out[z++] = pal[v][2]; - if (target == 4) out[z++] = 255; - if (i+1 == (int) s->img_x) break; - v = (info.bpp == 8) ? 
stbi__get8(s) : v2; - out[z++] = pal[v][0]; - out[z++] = pal[v][1]; - out[z++] = pal[v][2]; - if (target == 4) out[z++] = 255; + } + } + + // if alpha channel is all 0s, replace with all 255s + if (target == 4 && all_a == 0) + for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4) + out[i] = 255; + + if (flip_vertically) { + stbi_uc t; + for (j = 0; j < (int)s->img_y >> 1; ++j) { + stbi_uc * p1 = out + j * s->img_x * target; + stbi_uc * p2 = out + (s->img_y - 1 - j) * s->img_x * target; + for (i = 0; i < (int)s->img_x * target; ++i) { + t = p1[i]; + p1[i] = p2[i]; + p2[i] = t; } - stbi__skip(s, pad); - } - } - } else { - int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0; - int z = 0; - int easy=0; - stbi__skip(s, info.offset - info.extra_read - info.hsz); - if (info.bpp == 24) width = 3 * s->img_x; - else if (info.bpp == 16) width = 2*s->img_x; - else /* bpp = 32 and pad = 0 */ width=0; - pad = (-width) & 3; - if (info.bpp == 24) { - easy = 1; - } else if (info.bpp == 32) { - if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) - easy = 2; - } - if (!easy) { - if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } - // right shift amt to put high bit in position #7 - rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr); - gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg); - bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb); - ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma); - if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } - } - for (j=0; j < (int) s->img_y; ++j) { - if (easy) { - for (i=0; i < (int) s->img_x; ++i) { - unsigned char a; - out[z+2] = stbi__get8(s); - out[z+1] = stbi__get8(s); - out[z+0] = stbi__get8(s); - z += 3; - a = (easy == 2 ? stbi__get8(s) : 255); - all_a |= a; - if (target == 4) out[z++] = a; - } - } else { - int bpp = info.bpp; - for (i=0; i < (int) s->img_x; ++i) { - stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s)); - unsigned int a; - out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); - out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); - out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); - a = (ma ? 
stbi__shiftsigned(v & ma, ashift, acount) : 255); - all_a |= a; - if (target == 4) out[z++] = STBI__BYTECAST(a); - } - } - stbi__skip(s, pad); - } - } + } + } - // if alpha channel is all 0s, replace with all 255s - if (target == 4 && all_a == 0) - for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4) - out[i] = 255; + if (req_comp && req_comp != target) { + out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); + if (out == NULL) + return out; // stbi__convert_format frees input on failure + } - if (flip_vertically) { - stbi_uc t; - for (j=0; j < (int) s->img_y>>1; ++j) { - stbi_uc *p1 = out + j *s->img_x*target; - stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target; - for (i=0; i < (int) s->img_x*target; ++i) { - t = p1[i]; p1[i] = p2[i]; p2[i] = t; - } - } - } - - if (req_comp && req_comp != target) { - out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); - if (out == NULL) return out; // stbi__convert_format frees input on failure - } - - *x = s->img_x; - *y = s->img_y; - if (comp) *comp = s->img_n; - return out; + *x = s->img_x; + *y = s->img_y; + if (comp) + *comp = s->img_n; + return out; } #endif @@ -5736,68 +6100,74 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req // by Jonathan Dummer #ifndef STBI_NO_TGA // returns STBI_rgb or whatever, 0 on error -static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16) -{ - // only RGB or RGBA (incl. 16bit) or grey allowed - if (is_rgb16) *is_rgb16 = 0; - switch(bits_per_pixel) { - case 8: return STBI_grey; - case 16: if(is_grey) return STBI_grey_alpha; - // fallthrough - case 15: if(is_rgb16) *is_rgb16 = 1; - return STBI_rgb; - case 24: // fallthrough - case 32: return bits_per_pixel/8; - default: return 0; - } +static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int * is_rgb16) { + // only RGB or RGBA (incl. 
16bit) or grey allowed + if (is_rgb16) + *is_rgb16 = 0; + switch (bits_per_pixel) { + case 8: + return STBI_grey; + case 16: + if (is_grey) + return STBI_grey_alpha; + // fallthrough + case 15: + if (is_rgb16) + *is_rgb16 = 1; + return STBI_rgb; + case 24: // fallthrough + case 32: + return bits_per_pixel / 8; + default: + return 0; + } } -static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) -{ +static int stbi__tga_info(stbi__context * s, int * x, int * y, int * comp) { int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp; int sz, tga_colormap_type; - stbi__get8(s); // discard Offset + stbi__get8(s); // discard Offset tga_colormap_type = stbi__get8(s); // colormap type - if( tga_colormap_type > 1 ) { + if (tga_colormap_type > 1) { stbi__rewind(s); - return 0; // only RGB or indexed allowed + return 0; // only RGB or indexed allowed } tga_image_type = stbi__get8(s); // image type - if ( tga_colormap_type == 1 ) { // colormapped (paletted) image + if (tga_colormap_type == 1) { // colormapped (paletted) image if (tga_image_type != 1 && tga_image_type != 9) { stbi__rewind(s); return 0; } - stbi__skip(s,4); // skip index of first colormap entry and number of entries - sz = stbi__get8(s); // check bits per palette color entry - if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) { + stbi__skip(s, 4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) { stbi__rewind(s); return 0; } - stbi__skip(s,4); // skip image x and y origin + stbi__skip(s, 4); // skip image x and y origin tga_colormap_bpp = sz; } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE - if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) { + if ((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11)) { stbi__rewind(s); return 0; // only RGB or grey allowed, +/- RLE } - stbi__skip(s,9); // skip colormap specification and image x/y origin + stbi__skip(s, 9); // skip colormap specification and image x/y origin tga_colormap_bpp = 0; } tga_w = stbi__get16le(s); - if( tga_w < 1 ) { + if (tga_w < 1) { stbi__rewind(s); - return 0; // test width + return 0; // test width } tga_h = stbi__get16le(s); - if( tga_h < 1 ) { + if (tga_h < 1) { stbi__rewind(s); - return 0; // test height + return 0; // test height } tga_bits_per_pixel = stbi__get8(s); // bits per pixel - stbi__get8(s); // ignore alpha bits + stbi__get8(s); // ignore alpha bits if (tga_colormap_bpp != 0) { - if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { + if ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { // when using a colormap, tga_bits_per_pixel is the size of the indexes // I don't think anything but 8 or 16bit indexes makes sense stbi__rewind(s); @@ -5807,270 +6177,268 @@ static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) } else { tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); } - if(!tga_comp) { - stbi__rewind(s); - return 0; + if (!tga_comp) { + stbi__rewind(s); + return 0; } - if (x) *x = tga_w; - if (y) *y = tga_h; - if (comp) *comp = tga_comp; - return 1; // seems to have passed everything + if (x) + *x = tga_w; + if (y) + *y = tga_h; + if (comp) + *comp = tga_comp; + return 1; // seems to have passed everything } -static int 
stbi__tga_test(stbi__context *s) -{ - int res = 0; - int sz, tga_color_type; - stbi__get8(s); // discard Offset - tga_color_type = stbi__get8(s); // color type - if ( tga_color_type > 1 ) goto errorEnd; // only RGB or indexed allowed - sz = stbi__get8(s); // image type - if ( tga_color_type == 1 ) { // colormapped (paletted) image - if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9 - stbi__skip(s,4); // skip index of first colormap entry and number of entries - sz = stbi__get8(s); // check bits per palette color entry - if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; - stbi__skip(s,4); // skip image x and y origin - } else { // "normal" image w/o colormap - if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE - stbi__skip(s,9); // skip colormap specification and image x/y origin - } - if ( stbi__get16le(s) < 1 ) goto errorEnd; // test width - if ( stbi__get16le(s) < 1 ) goto errorEnd; // test height - sz = stbi__get8(s); // bits per pixel - if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index - if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; +static int stbi__tga_test(stbi__context * s) { + int res = 0; + int sz, tga_color_type; + stbi__get8(s); // discard Offset + tga_color_type = stbi__get8(s); // color type + if (tga_color_type > 1) + goto errorEnd; // only RGB or indexed allowed + sz = stbi__get8(s); // image type + if (tga_color_type == 1) { // colormapped (paletted) image + if (sz != 1 && sz != 9) + goto errorEnd; // colortype 1 demands image type 1 or 9 + stbi__skip(s, 4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) + goto errorEnd; + stbi__skip(s, 4); // skip image x and y origin + } else { // "normal" image w/o colormap + if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11)) + goto errorEnd; // only RGB or grey allowed, +/- RLE + stbi__skip(s, 9); // skip colormap specification and image x/y origin + } + if (stbi__get16le(s) < 1) + goto errorEnd; // test width + if (stbi__get16le(s) < 1) + goto errorEnd; // test height + sz = stbi__get8(s); // bits per pixel + if ((tga_color_type == 1) && (sz != 8) && (sz != 16)) + goto errorEnd; // for colormapped images, bpp is size of an index + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) + goto errorEnd; - res = 1; // if we got this far, everything's good and we can return 1 instead of 0 + res = 1; // if we got this far, everything's good and we can return 1 instead of 0 errorEnd: - stbi__rewind(s); - return res; + stbi__rewind(s); + return res; } // read 16bit value and convert to 24bit RGB -static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out) -{ - stbi__uint16 px = (stbi__uint16)stbi__get16le(s); - stbi__uint16 fiveBitMask = 31; - // we have 3 channels with 5bits each - int r = (px >> 10) & fiveBitMask; - int g = (px >> 5) & fiveBitMask; - int b = px & fiveBitMask; - // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later - out[0] = (stbi_uc)((r * 255)/31); - out[1] = (stbi_uc)((g * 255)/31); - out[2] = (stbi_uc)((b * 255)/31); +static void stbi__tga_read_rgb16(stbi__context * s, stbi_uc * out) { + stbi__uint16 px = (stbi__uint16)stbi__get16le(s); + stbi__uint16 fiveBitMask = 31; + // we 
have 3 channels with 5bits each + int r = (px >> 10) & fiveBitMask; + int g = (px >> 5) & fiveBitMask; + int b = px & fiveBitMask; + // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later + out[0] = (stbi_uc)((r * 255) / 31); + out[1] = (stbi_uc)((g * 255) / 31); + out[2] = (stbi_uc)((b * 255) / 31); - // some people claim that the most significant bit might be used for alpha - // (possibly if an alpha-bit is set in the "image descriptor byte") - // but that only made 16bit test images completely translucent.. - // so let's treat all 15 and 16bit TGAs as RGB with no alpha. + // some people claim that the most significant bit might be used for alpha + // (possibly if an alpha-bit is set in the "image descriptor byte") + // but that only made 16bit test images completely translucent.. + // so let's treat all 15 and 16bit TGAs as RGB with no alpha. } -static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) -{ - // read in the TGA header stuff - int tga_offset = stbi__get8(s); - int tga_indexed = stbi__get8(s); - int tga_image_type = stbi__get8(s); - int tga_is_RLE = 0; - int tga_palette_start = stbi__get16le(s); - int tga_palette_len = stbi__get16le(s); - int tga_palette_bits = stbi__get8(s); - int tga_x_origin = stbi__get16le(s); - int tga_y_origin = stbi__get16le(s); - int tga_width = stbi__get16le(s); - int tga_height = stbi__get16le(s); - int tga_bits_per_pixel = stbi__get8(s); - int tga_comp, tga_rgb16=0; - int tga_inverted = stbi__get8(s); - // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?) - // image data - unsigned char *tga_data; - unsigned char *tga_palette = NULL; - int i, j; - unsigned char raw_data[4] = {0}; - int RLE_count = 0; - int RLE_repeating = 0; - int read_next_pixel = 1; - STBI_NOTUSED(ri); - STBI_NOTUSED(tga_x_origin); // @TODO - STBI_NOTUSED(tga_y_origin); // @TODO +static void * stbi__tga_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + // read in the TGA header stuff + int tga_offset = stbi__get8(s); + int tga_indexed = stbi__get8(s); + int tga_image_type = stbi__get8(s); + int tga_is_RLE = 0; + int tga_palette_start = stbi__get16le(s); + int tga_palette_len = stbi__get16le(s); + int tga_palette_bits = stbi__get8(s); + int tga_x_origin = stbi__get16le(s); + int tga_y_origin = stbi__get16le(s); + int tga_width = stbi__get16le(s); + int tga_height = stbi__get16le(s); + int tga_bits_per_pixel = stbi__get8(s); + int tga_comp, tga_rgb16 = 0; + int tga_inverted = stbi__get8(s); + // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?) 
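stbi__tga_read_rgb16, reformatted just above, expands each 5-bit field of a 15/16-bit TGA pixel with (x * 255) / 31 and deliberately ignores the top bit rather than treating it as alpha. A small self-contained sketch of that expansion follows; rgb16_to_rgb24 is an illustrative name, and the 16-bit value is assumed to have been read little-endian already.

#include <stdint.h>
#include <stdio.h>

/* Expand a 16-bit TGA pixel (A1R5G5B5, alpha bit ignored) to 8-bit RGB,
 * using the same (x * 255) / 31 scaling as stbi__tga_read_rgb16. */
static void rgb16_to_rgb24(uint16_t px, uint8_t out[3]) {
    int r = (px >> 10) & 31;
    int g = (px >> 5) & 31;
    int b = px & 31;
    out[0] = (uint8_t)((r * 255) / 31);
    out[1] = (uint8_t)((g * 255) / 31);
    out[2] = (uint8_t)((b * 255) / 31);
}

int main(void) {
    uint8_t rgb[3];
    rgb16_to_rgb24((uint16_t)((31 << 10) | (16 << 5) | 0), rgb);
    printf("%d %d %d\n", rgb[0], rgb[1], rgb[2]);  /* 255 131 0 */
    return 0;
}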
+ // image data + unsigned char * tga_data; + unsigned char * tga_palette = NULL; + int i, j; + unsigned char raw_data[4] = {0}; + int RLE_count = 0; + int RLE_repeating = 0; + int read_next_pixel = 1; + STBI_NOTUSED(ri); + STBI_NOTUSED(tga_x_origin); // @TODO + STBI_NOTUSED(tga_y_origin); // @TODO - if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); - if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (tga_height > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + if (tga_width > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); - // do a tiny bit of precessing - if ( tga_image_type >= 8 ) - { - tga_image_type -= 8; - tga_is_RLE = 1; - } - tga_inverted = 1 - ((tga_inverted >> 5) & 1); + // do a tiny bit of precessing + if (tga_image_type >= 8) { + tga_image_type -= 8; + tga_is_RLE = 1; + } + tga_inverted = 1 - ((tga_inverted >> 5) & 1); - // If I'm paletted, then I'll use the number of bits from the palette - if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16); - else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16); + // If I'm paletted, then I'll use the number of bits from the palette + if (tga_indexed) + tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16); + else + tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16); - if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency - return stbi__errpuc("bad format", "Can't find out TGA pixelformat"); + if (!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency + return stbi__errpuc("bad format", "Can't find out TGA pixelformat"); - // tga info - *x = tga_width; - *y = tga_height; - if (comp) *comp = tga_comp; + // tga info + *x = tga_width; + *y = tga_height; + if (comp) + *comp = tga_comp; - if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) - return stbi__errpuc("too large", "Corrupt TGA"); + if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) + return stbi__errpuc("too large", "Corrupt TGA"); - tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); - if (!tga_data) return stbi__errpuc("outofmem", "Out of memory"); + tga_data = (unsigned char *)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); + if (!tga_data) + return stbi__errpuc("outofmem", "Out of memory"); - // skip to the data's starting position (offset usually = 0) - stbi__skip(s, tga_offset ); + // skip to the data's starting position (offset usually = 0) + stbi__skip(s, tga_offset); - if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) { - for (i=0; i < tga_height; ++i) { - int row = tga_inverted ? tga_height -i - 1 : i; - stbi_uc *tga_row = tga_data + row*tga_width*tga_comp; - stbi__getn(s, tga_row, tga_width * tga_comp); - } - } else { - // do I need to load a palette? - if ( tga_indexed) - { - if (tga_palette_len == 0) { /* you have to have at least one entry! */ - STBI_FREE(tga_data); - return stbi__errpuc("bad palette", "Corrupt TGA"); - } - - // any data to skip? 
(offset usually = 0) - stbi__skip(s, tga_palette_start ); - // load the palette - tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0); - if (!tga_palette) { - STBI_FREE(tga_data); - return stbi__errpuc("outofmem", "Out of memory"); - } - if (tga_rgb16) { - stbi_uc *pal_entry = tga_palette; - STBI_ASSERT(tga_comp == STBI_rgb); - for (i=0; i < tga_palette_len; ++i) { - stbi__tga_read_rgb16(s, pal_entry); - pal_entry += tga_comp; + if (!tga_indexed && !tga_is_RLE && !tga_rgb16) { + for (i = 0; i < tga_height; ++i) { + int row = tga_inverted ? tga_height - i - 1 : i; + stbi_uc * tga_row = tga_data + row * tga_width * tga_comp; + stbi__getn(s, tga_row, tga_width * tga_comp); + } + } else { + // do I need to load a palette? + if (tga_indexed) { + if (tga_palette_len == 0) { /* you have to have at least one entry! */ + STBI_FREE(tga_data); + return stbi__errpuc("bad palette", "Corrupt TGA"); } - } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) { - STBI_FREE(tga_data); - STBI_FREE(tga_palette); - return stbi__errpuc("bad palette", "Corrupt TGA"); - } - } - // load the data - for (i=0; i < tga_width * tga_height; ++i) - { - // if I'm in RLE mode, do I need to get a RLE stbi__pngchunk? - if ( tga_is_RLE ) - { - if ( RLE_count == 0 ) - { - // yep, get the next byte as a RLE command - int RLE_cmd = stbi__get8(s); - RLE_count = 1 + (RLE_cmd & 127); - RLE_repeating = RLE_cmd >> 7; - read_next_pixel = 1; - } else if ( !RLE_repeating ) - { - read_next_pixel = 1; + + // any data to skip? (offset usually = 0) + stbi__skip(s, tga_palette_start); + // load the palette + tga_palette = (unsigned char *)stbi__malloc_mad2(tga_palette_len, tga_comp, 0); + if (!tga_palette) { + STBI_FREE(tga_data); + return stbi__errpuc("outofmem", "Out of memory"); } - } else - { - read_next_pixel = 1; - } - // OK, if I need to read a pixel, do it now - if ( read_next_pixel ) - { - // load however much data we did have - if ( tga_indexed ) - { - // read in index, then perform the lookup - int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s); - if ( pal_idx >= tga_palette_len ) { - // invalid index - pal_idx = 0; - } - pal_idx *= tga_comp; - for (j = 0; j < tga_comp; ++j) { - raw_data[j] = tga_palette[pal_idx+j]; - } - } else if(tga_rgb16) { - STBI_ASSERT(tga_comp == STBI_rgb); - stbi__tga_read_rgb16(s, raw_data); + if (tga_rgb16) { + stbi_uc * pal_entry = tga_palette; + STBI_ASSERT(tga_comp == STBI_rgb); + for (i = 0; i < tga_palette_len; ++i) { + stbi__tga_read_rgb16(s, pal_entry); + pal_entry += tga_comp; + } + } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) { + STBI_FREE(tga_data); + STBI_FREE(tga_palette); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + } + // load the data + for (i = 0; i < tga_width * tga_height; ++i) { + // if I'm in RLE mode, do I need to get a RLE stbi__pngchunk? 
+ if (tga_is_RLE) { + if (RLE_count == 0) { + // yep, get the next byte as a RLE command + int RLE_cmd = stbi__get8(s); + RLE_count = 1 + (RLE_cmd & 127); + RLE_repeating = RLE_cmd >> 7; + read_next_pixel = 1; + } else if (!RLE_repeating) { + read_next_pixel = 1; + } } else { - // read in the data raw - for (j = 0; j < tga_comp; ++j) { - raw_data[j] = stbi__get8(s); - } + read_next_pixel = 1; } - // clear the reading flag for the next pixel - read_next_pixel = 0; - } // end of reading a pixel + // OK, if I need to read a pixel, do it now + if (read_next_pixel) { + // load however much data we did have + if (tga_indexed) { + // read in index, then perform the lookup + int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s); + if (pal_idx >= tga_palette_len) { + // invalid index + pal_idx = 0; + } + pal_idx *= tga_comp; + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = tga_palette[pal_idx + j]; + } + } else if (tga_rgb16) { + STBI_ASSERT(tga_comp == STBI_rgb); + stbi__tga_read_rgb16(s, raw_data); + } else { + // read in the data raw + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = stbi__get8(s); + } + } + // clear the reading flag for the next pixel + read_next_pixel = 0; + } // end of reading a pixel - // copy data - for (j = 0; j < tga_comp; ++j) - tga_data[i*tga_comp+j] = raw_data[j]; + // copy data + for (j = 0; j < tga_comp; ++j) + tga_data[i * tga_comp + j] = raw_data[j]; - // in case we're in RLE mode, keep counting down - --RLE_count; - } - // do I need to invert the image? - if ( tga_inverted ) - { - for (j = 0; j*2 < tga_height; ++j) - { - int index1 = j * tga_width * tga_comp; - int index2 = (tga_height - 1 - j) * tga_width * tga_comp; - for (i = tga_width * tga_comp; i > 0; --i) - { - unsigned char temp = tga_data[index1]; - tga_data[index1] = tga_data[index2]; - tga_data[index2] = temp; - ++index1; - ++index2; + // in case we're in RLE mode, keep counting down + --RLE_count; + } + // do I need to invert the image? 
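The RLE branch reformatted above treats every packet byte as a 7-bit count (stored as count-1) plus a repeat flag in the high bit: a run packet repeats the next pixel, a raw packet is followed by that many literal pixels. Here is a hedged standalone sketch of that packet loop over an in-memory buffer, with one byte per pixel for brevity; tga_rle_decode is an illustrative name, not an stb_image function.

#include <stddef.h>
#include <stdio.h>

/* Decode TGA-style RLE: each packet byte holds a 7-bit count (actual count is +1);
 * if the high bit is set the next pixel repeats, otherwise that many raw pixels follow.
 * Pixels are 1 byte here; stb_image does the same per tga_comp bytes. */
static size_t tga_rle_decode(const unsigned char *src, size_t srclen,
                             unsigned char *dst, size_t dstlen) {
    size_t si = 0, di = 0;
    while (si < srclen && di < dstlen) {
        unsigned char cmd = src[si++];
        size_t count = (size_t)(cmd & 127) + 1;
        if (cmd & 0x80) {               /* run packet: repeat one source pixel */
            unsigned char v;
            if (si >= srclen) break;
            v = src[si++];
            while (count-- && di < dstlen) dst[di++] = v;
        } else {                        /* raw packet: copy literal pixels */
            while (count-- && si < srclen && di < dstlen) dst[di++] = src[si++];
        }
    }
    return di;  /* number of pixels produced */
}

int main(void) {
    /* a run of 4 x 0xAA, then 3 raw pixels 1,2,3 */
    const unsigned char rle[] = { 0x83, 0xAA, 0x02, 1, 2, 3 };
    unsigned char px[16];
    size_t n = tga_rle_decode(rle, sizeof rle, px, sizeof px);
    for (size_t i = 0; i < n; ++i) printf("%d ", px[i]);
    printf("\n");   /* 170 170 170 170 1 2 3 */
    return 0;
}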
+ if (tga_inverted) { + for (j = 0; j * 2 < tga_height; ++j) { + int index1 = j * tga_width * tga_comp; + int index2 = (tga_height - 1 - j) * tga_width * tga_comp; + for (i = tga_width * tga_comp; i > 0; --i) { + unsigned char temp = tga_data[index1]; + tga_data[index1] = tga_data[index2]; + tga_data[index2] = temp; + ++index1; + ++index2; + } } - } - } - // clear my palette, if I had one - if ( tga_palette != NULL ) - { - STBI_FREE( tga_palette ); - } - } + } + // clear my palette, if I had one + if (tga_palette != NULL) { + STBI_FREE(tga_palette); + } + } - // swap RGB - if the source data was RGB16, it already is in the right order - if (tga_comp >= 3 && !tga_rgb16) - { - unsigned char* tga_pixel = tga_data; - for (i=0; i < tga_width * tga_height; ++i) - { - unsigned char temp = tga_pixel[0]; - tga_pixel[0] = tga_pixel[2]; - tga_pixel[2] = temp; - tga_pixel += tga_comp; - } - } + // swap RGB - if the source data was RGB16, it already is in the right order + if (tga_comp >= 3 && !tga_rgb16) { + unsigned char * tga_pixel = tga_data; + for (i = 0; i < tga_width * tga_height; ++i) { + unsigned char temp = tga_pixel[0]; + tga_pixel[0] = tga_pixel[2]; + tga_pixel[2] = temp; + tga_pixel += tga_comp; + } + } - // convert to target component count - if (req_comp && req_comp != tga_comp) - tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); + // convert to target component count + if (req_comp && req_comp != tga_comp) + tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); - // the things I do to get rid of an error message, and yet keep - // Microsoft's C compilers happy... [8^( - tga_palette_start = tga_palette_len = tga_palette_bits = - tga_x_origin = tga_y_origin = 0; - STBI_NOTUSED(tga_palette_start); - // OK, done - return tga_data; + // the things I do to get rid of an error message, and yet keep + // Microsoft's C compilers happy... [8^( + tga_palette_start = tga_palette_len = tga_palette_bits = tga_x_origin = tga_y_origin = 0; + STBI_NOTUSED(tga_palette_start); + // OK, done + return tga_data; } #endif @@ -6078,250 +6446,253 @@ static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req // Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB #ifndef STBI_NO_PSD -static int stbi__psd_test(stbi__context *s) -{ - int r = (stbi__get32be(s) == 0x38425053); - stbi__rewind(s); - return r; +static int stbi__psd_test(stbi__context * s) { + int r = (stbi__get32be(s) == 0x38425053); + stbi__rewind(s); + return r; } -static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount) -{ - int count, nleft, len; +static int stbi__psd_decode_rle(stbi__context * s, stbi_uc * p, int pixelCount) { + int count, nleft, len; - count = 0; - while ((nleft = pixelCount - count) > 0) { - len = stbi__get8(s); - if (len == 128) { - // No-op. - } else if (len < 128) { - // Copy next len+1 bytes literally. - len++; - if (len > nleft) return 0; // corrupt data - count += len; - while (len) { - *p = stbi__get8(s); - p += 4; - len--; - } - } else if (len > 128) { - stbi_uc val; - // Next -len+1 bytes in the dest are replicated from next source byte. - // (Interpret len as a negative 8-bit int.) - len = 257 - len; - if (len > nleft) return 0; // corrupt data - val = stbi__get8(s); - count += len; - while (len) { - *p = val; - p += 4; - len--; - } - } - } + count = 0; + while ((nleft = pixelCount - count) > 0) { + len = stbi__get8(s); + if (len == 128) { + // No-op. 
+ } else if (len < 128) { + // Copy next len+1 bytes literally. + len++; + if (len > nleft) + return 0; // corrupt data + count += len; + while (len) { + *p = stbi__get8(s); + p += 4; + len--; + } + } else if (len > 128) { + stbi_uc val; + // Next -len+1 bytes in the dest are replicated from next source byte. + // (Interpret len as a negative 8-bit int.) + len = 257 - len; + if (len > nleft) + return 0; // corrupt data + val = stbi__get8(s); + count += len; + while (len) { + *p = val; + p += 4; + len--; + } + } + } - return 1; + return 1; } -static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) -{ - int pixelCount; - int channelCount, compression; - int channel, i; - int bitdepth; - int w,h; - stbi_uc *out; - STBI_NOTUSED(ri); +static void * stbi__psd_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri, int bpc) { + int pixelCount; + int channelCount, compression; + int channel, i; + int bitdepth; + int w, h; + stbi_uc * out; + STBI_NOTUSED(ri); - // Check identifier - if (stbi__get32be(s) != 0x38425053) // "8BPS" - return stbi__errpuc("not PSD", "Corrupt PSD image"); + // Check identifier + if (stbi__get32be(s) != 0x38425053) // "8BPS" + return stbi__errpuc("not PSD", "Corrupt PSD image"); - // Check file type version. - if (stbi__get16be(s) != 1) - return stbi__errpuc("wrong version", "Unsupported version of PSD image"); + // Check file type version. + if (stbi__get16be(s) != 1) + return stbi__errpuc("wrong version", "Unsupported version of PSD image"); - // Skip 6 reserved bytes. - stbi__skip(s, 6 ); + // Skip 6 reserved bytes. + stbi__skip(s, 6); - // Read the number of channels (R, G, B, A, etc). - channelCount = stbi__get16be(s); - if (channelCount < 0 || channelCount > 16) - return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); + // Read the number of channels (R, G, B, A, etc). + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) + return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); - // Read the rows and columns of the image. - h = stbi__get32be(s); - w = stbi__get32be(s); + // Read the rows and columns of the image. + h = stbi__get32be(s); + w = stbi__get32be(s); - if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); - if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (h > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + if (w > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); - // Make sure the depth is 8 bits. - bitdepth = stbi__get16be(s); - if (bitdepth != 8 && bitdepth != 16) - return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); + // Make sure the depth is 8 bits. + bitdepth = stbi__get16be(s); + if (bitdepth != 8 && bitdepth != 16) + return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); - // Make sure the color mode is RGB. - // Valid options are: - // 0: Bitmap - // 1: Grayscale - // 2: Indexed color - // 3: RGB color - // 4: CMYK color - // 7: Multichannel - // 8: Duotone - // 9: Lab color - if (stbi__get16be(s) != 3) - return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); + // Make sure the color mode is RGB. 
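stbi__psd_decode_rle above is the PackBits scheme the later comment spells out: a byte n in 0..127 copies the next n+1 bytes literally, n in 129..255 repeats the next byte 257-n times, and 128 is a no-op; the real routine also writes with a stride of 4 because the output is interleaved RGBA. Below is a standalone sketch decoding into a flat buffer instead; packbits_decode is an illustrative name.

#include <stdio.h>
#include <string.h>

/* PackBits-style RLE as used by PSD/TIFF, decoded into a flat buffer.
 * Returns 1 on success, 0 if the stream is truncated or would overflow `want` bytes. */
static int packbits_decode(const unsigned char *src, int srclen,
                           unsigned char *dst, int want) {
    int si = 0, di = 0;
    while (di < want) {
        int n;
        if (si >= srclen) return 0;
        n = src[si++];
        if (n == 128) {
            /* no-op */
        } else if (n < 128) {
            int len = n + 1;                       /* literal run */
            if (len > want - di || si + len > srclen) return 0;
            memcpy(dst + di, src + si, (size_t)len);
            si += len; di += len;
        } else {
            int len = 257 - n;                     /* replicate next byte */
            if (len > want - di || si >= srclen) return 0;
            memset(dst + di, src[si++], (size_t)len);
            di += len;
        }
    }
    return 1;
}

int main(void) {
    const unsigned char rle[] = { 0xFE, 0x55, 0x02, 'a', 'b', 'c' };  /* 3x0x55 then "abc" */
    unsigned char out[6];
    if (packbits_decode(rle, (int)sizeof rle, out, (int)sizeof out))
        printf("%d %d %d %c%c%c\n", out[0], out[1], out[2], out[3], out[4], out[5]);
    return 0;
}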
+ // Valid options are: + // 0: Bitmap + // 1: Grayscale + // 2: Indexed color + // 3: RGB color + // 4: CMYK color + // 7: Multichannel + // 8: Duotone + // 9: Lab color + if (stbi__get16be(s) != 3) + return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); - // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) - stbi__skip(s,stbi__get32be(s) ); + // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) + stbi__skip(s, stbi__get32be(s)); - // Skip the image resources. (resolution, pen tool paths, etc) - stbi__skip(s, stbi__get32be(s) ); + // Skip the image resources. (resolution, pen tool paths, etc) + stbi__skip(s, stbi__get32be(s)); - // Skip the reserved data. - stbi__skip(s, stbi__get32be(s) ); + // Skip the reserved data. + stbi__skip(s, stbi__get32be(s)); - // Find out if the data is compressed. - // Known values: - // 0: no compression - // 1: RLE compressed - compression = stbi__get16be(s); - if (compression > 1) - return stbi__errpuc("bad compression", "PSD has an unknown compression format"); + // Find out if the data is compressed. + // Known values: + // 0: no compression + // 1: RLE compressed + compression = stbi__get16be(s); + if (compression > 1) + return stbi__errpuc("bad compression", "PSD has an unknown compression format"); - // Check size - if (!stbi__mad3sizes_valid(4, w, h, 0)) - return stbi__errpuc("too large", "Corrupt PSD"); + // Check size + if (!stbi__mad3sizes_valid(4, w, h, 0)) + return stbi__errpuc("too large", "Corrupt PSD"); - // Create the destination image. + // Create the destination image. - if (!compression && bitdepth == 16 && bpc == 16) { - out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0); - ri->bits_per_channel = 16; - } else - out = (stbi_uc *) stbi__malloc(4 * w*h); + if (!compression && bitdepth == 16 && bpc == 16) { + out = (stbi_uc *)stbi__malloc_mad3(8, w, h, 0); + ri->bits_per_channel = 16; + } else + out = (stbi_uc *)stbi__malloc(4 * w * h); - if (!out) return stbi__errpuc("outofmem", "Out of memory"); - pixelCount = w*h; + if (!out) + return stbi__errpuc("outofmem", "Out of memory"); + pixelCount = w * h; - // Initialize the data to zero. - //memset( out, 0, pixelCount * 4 ); + // Initialize the data to zero. + // memset( out, 0, pixelCount * 4 ); - // Finally, the image data. - if (compression) { - // RLE as used by .PSD and .TIFF - // Loop until you get the number of unpacked bytes you are expecting: - // Read the next source byte into n. - // If n is between 0 and 127 inclusive, copy the next n+1 bytes literally. - // Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times. - // Else if n is 128, noop. - // Endloop + // Finally, the image data. + if (compression) { + // RLE as used by .PSD and .TIFF + // Loop until you get the number of unpacked bytes you are expecting: + // Read the next source byte into n. + // If n is between 0 and 127 inclusive, copy the next n+1 bytes literally. + // Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times. + // Else if n is 128, noop. + // Endloop - // The RLE-compressed data is preceded by a 2-byte data count for each row in the data, - // which we're going to just skip. - stbi__skip(s, h * channelCount * 2 ); + // The RLE-compressed data is preceded by a 2-byte data count for each row in the data, + // which we're going to just skip. + stbi__skip(s, h * channelCount * 2); - // Read the RLE data by channel. 
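In the channel loop that follows, each PSD plane is scattered into the interleaved RGBA output with a stride of 4, and channels the file does not provide are filled with 0 (255 for alpha). A minimal sketch of that planar-to-interleaved fill under the same convention; fill_channel is a made-up name.

#include <stdio.h>

/* Scatter one planar channel into interleaved RGBA, stride 4, mirroring how the
 * PSD loader writes each channel; a NULL plane stands for an absent channel. */
static void fill_channel(unsigned char *rgba, int pixel_count,
                         int channel, const unsigned char *plane) {
    unsigned char *p = rgba + channel;
    for (int i = 0; i < pixel_count; ++i, p += 4)
        *p = plane ? plane[i] : (unsigned char)(channel == 3 ? 255 : 0);
}

int main(void) {
    unsigned char rgba[4 * 2] = { 0 };
    const unsigned char red[2]   = { 10, 20 };
    const unsigned char green[2] = { 30, 40 };
    fill_channel(rgba, 2, 0, red);
    fill_channel(rgba, 2, 1, green);
    fill_channel(rgba, 2, 2, NULL);   /* absent blue  -> 0   */
    fill_channel(rgba, 2, 3, NULL);   /* absent alpha -> 255 */
    for (int i = 0; i < 8; ++i) printf("%d ", rgba[i]);
    printf("\n");  /* 10 30 0 255 20 40 0 255 */
    return 0;
}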
- for (channel = 0; channel < 4; channel++) { - stbi_uc *p; + // Read the RLE data by channel. + for (channel = 0; channel < 4; channel++) { + stbi_uc * p; - p = out+channel; - if (channel >= channelCount) { - // Fill this channel with default data. - for (i = 0; i < pixelCount; i++, p += 4) - *p = (channel == 3 ? 255 : 0); - } else { - // Read the RLE data. - if (!stbi__psd_decode_rle(s, p, pixelCount)) { - STBI_FREE(out); - return stbi__errpuc("corrupt", "bad RLE data"); - } - } - } - - } else { - // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) - // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. - - // Read the data by channel. - for (channel = 0; channel < 4; channel++) { - if (channel >= channelCount) { - // Fill this channel with default data. - if (bitdepth == 16 && bpc == 16) { - stbi__uint16 *q = ((stbi__uint16 *) out) + channel; - stbi__uint16 val = channel == 3 ? 65535 : 0; - for (i = 0; i < pixelCount; i++, q += 4) - *q = val; + p = out + channel; + if (channel >= channelCount) { + // Fill this channel with default data. + for (i = 0; i < pixelCount; i++, p += 4) + *p = (channel == 3 ? 255 : 0); } else { - stbi_uc *p = out+channel; - stbi_uc val = channel == 3 ? 255 : 0; - for (i = 0; i < pixelCount; i++, p += 4) - *p = val; + // Read the RLE data. + if (!stbi__psd_decode_rle(s, p, pixelCount)) { + STBI_FREE(out); + return stbi__errpuc("corrupt", "bad RLE data"); + } } - } else { - if (ri->bits_per_channel == 16) { // output bpc - stbi__uint16 *q = ((stbi__uint16 *) out) + channel; - for (i = 0; i < pixelCount; i++, q += 4) - *q = (stbi__uint16) stbi__get16be(s); + } + } else { + // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) + // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. + + // Read the data by channel. + for (channel = 0; channel < 4; channel++) { + if (channel >= channelCount) { + // Fill this channel with default data. + if (bitdepth == 16 && bpc == 16) { + stbi__uint16 * q = ((stbi__uint16 *)out) + channel; + stbi__uint16 val = channel == 3 ? 65535 : 0; + for (i = 0; i < pixelCount; i++, q += 4) + *q = val; + } else { + stbi_uc * p = out + channel; + stbi_uc val = channel == 3 ? 
255 : 0; + for (i = 0; i < pixelCount; i++, p += 4) + *p = val; + } } else { - stbi_uc *p = out+channel; - if (bitdepth == 16) { // input bpc - for (i = 0; i < pixelCount; i++, p += 4) - *p = (stbi_uc) (stbi__get16be(s) >> 8); - } else { - for (i = 0; i < pixelCount; i++, p += 4) - *p = stbi__get8(s); - } + if (ri->bits_per_channel == 16) { // output bpc + stbi__uint16 * q = ((stbi__uint16 *)out) + channel; + for (i = 0; i < pixelCount; i++, q += 4) + *q = (stbi__uint16)stbi__get16be(s); + } else { + stbi_uc * p = out + channel; + if (bitdepth == 16) { // input bpc + for (i = 0; i < pixelCount; i++, p += 4) + *p = (stbi_uc)(stbi__get16be(s) >> 8); + } else { + for (i = 0; i < pixelCount; i++, p += 4) + *p = stbi__get8(s); + } + } } - } - } - } + } + } - // remove weird white matte from PSD - if (channelCount >= 4) { - if (ri->bits_per_channel == 16) { - for (i=0; i < w*h; ++i) { - stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i; - if (pixel[3] != 0 && pixel[3] != 65535) { - float a = pixel[3] / 65535.0f; - float ra = 1.0f / a; - float inv_a = 65535.0f * (1 - ra); - pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a); - pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a); - pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a); + // remove weird white matte from PSD + if (channelCount >= 4) { + if (ri->bits_per_channel == 16) { + for (i = 0; i < w * h; ++i) { + stbi__uint16 * pixel = (stbi__uint16 *)out + 4 * i; + if (pixel[3] != 0 && pixel[3] != 65535) { + float a = pixel[3] / 65535.0f; + float ra = 1.0f / a; + float inv_a = 65535.0f * (1 - ra); + pixel[0] = (stbi__uint16)(pixel[0] * ra + inv_a); + pixel[1] = (stbi__uint16)(pixel[1] * ra + inv_a); + pixel[2] = (stbi__uint16)(pixel[2] * ra + inv_a); + } } - } - } else { - for (i=0; i < w*h; ++i) { - unsigned char *pixel = out + 4*i; - if (pixel[3] != 0 && pixel[3] != 255) { - float a = pixel[3] / 255.0f; - float ra = 1.0f / a; - float inv_a = 255.0f * (1 - ra); - pixel[0] = (unsigned char) (pixel[0]*ra + inv_a); - pixel[1] = (unsigned char) (pixel[1]*ra + inv_a); - pixel[2] = (unsigned char) (pixel[2]*ra + inv_a); + } else { + for (i = 0; i < w * h; ++i) { + unsigned char * pixel = out + 4 * i; + if (pixel[3] != 0 && pixel[3] != 255) { + float a = pixel[3] / 255.0f; + float ra = 1.0f / a; + float inv_a = 255.0f * (1 - ra); + pixel[0] = (unsigned char)(pixel[0] * ra + inv_a); + pixel[1] = (unsigned char)(pixel[1] * ra + inv_a); + pixel[2] = (unsigned char)(pixel[2] * ra + inv_a); + } } - } - } - } + } + } - // convert to desired output format - if (req_comp && req_comp != 4) { - if (ri->bits_per_channel == 16) - out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h); - else - out = stbi__convert_format(out, 4, req_comp, w, h); - if (out == NULL) return out; // stbi__convert_format frees input on failure - } + // convert to desired output format + if (req_comp && req_comp != 4) { + if (ri->bits_per_channel == 16) + out = (stbi_uc *)stbi__convert_format16((stbi__uint16 *)out, 4, req_comp, w, h); + else + out = stbi__convert_format(out, 4, req_comp, w, h); + if (out == NULL) + return out; // stbi__convert_format frees input on failure + } - if (comp) *comp = 4; - *y = h; - *x = w; + if (comp) + *comp = 4; + *y = h; + *x = w; - return out; + return out; } #endif @@ -6333,216 +6704,221 @@ static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req // See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/ #ifndef STBI_NO_PIC -static int stbi__pic_is4(stbi__context *s,const char *str) -{ - int i; - 
for (i=0; i<4; ++i) - if (stbi__get8(s) != (stbi_uc)str[i]) - return 0; +static int stbi__pic_is4(stbi__context * s, const char * str) { + int i; + for (i = 0; i < 4; ++i) + if (stbi__get8(s) != (stbi_uc)str[i]) + return 0; - return 1; + return 1; } -static int stbi__pic_test_core(stbi__context *s) -{ - int i; +static int stbi__pic_test_core(stbi__context * s) { + int i; - if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) - return 0; + if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) + return 0; - for(i=0;i<84;++i) - stbi__get8(s); + for (i = 0; i < 84; ++i) + stbi__get8(s); - if (!stbi__pic_is4(s,"PICT")) - return 0; + if (!stbi__pic_is4(s, "PICT")) + return 0; - return 1; + return 1; } -typedef struct -{ - stbi_uc size,type,channel; +typedef struct { + stbi_uc size, type, channel; } stbi__pic_packet; -static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest) -{ - int mask=0x80, i; +static stbi_uc * stbi__readval(stbi__context * s, int channel, stbi_uc * dest) { + int mask = 0x80, i; - for (i=0; i<4; ++i, mask>>=1) { - if (channel & mask) { - if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short"); - dest[i]=stbi__get8(s); - } - } + for (i = 0; i < 4; ++i, mask >>= 1) { + if (channel & mask) { + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "PIC file too short"); + dest[i] = stbi__get8(s); + } + } - return dest; + return dest; } -static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src) -{ - int mask=0x80,i; +static void stbi__copyval(int channel, stbi_uc * dest, const stbi_uc * src) { + int mask = 0x80, i; - for (i=0;i<4; ++i, mask>>=1) - if (channel&mask) - dest[i]=src[i]; + for (i = 0; i < 4; ++i, mask >>= 1) + if (channel & mask) + dest[i] = src[i]; } -static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result) -{ - int act_comp=0,num_packets=0,y,chained; - stbi__pic_packet packets[10]; +static stbi_uc * stbi__pic_load_core(stbi__context * s, int width, int height, int * comp, stbi_uc * result) { + int act_comp = 0, num_packets = 0, y, chained; + stbi__pic_packet packets[10]; - // this will (should...) cater for even some bizarre stuff like having data + // this will (should...) cater for even some bizarre stuff like having data // for the same channel in multiple packets. - do { - stbi__pic_packet *packet; + do { + stbi__pic_packet * packet; - if (num_packets==sizeof(packets)/sizeof(packets[0])) - return stbi__errpuc("bad format","too many packets"); + if (num_packets == sizeof(packets) / sizeof(packets[0])) + return stbi__errpuc("bad format", "too many packets"); - packet = &packets[num_packets++]; + packet = &packets[num_packets++]; - chained = stbi__get8(s); - packet->size = stbi__get8(s); - packet->type = stbi__get8(s); - packet->channel = stbi__get8(s); + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); - act_comp |= packet->channel; + act_comp |= packet->channel; - if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (reading packets)"); - if (packet->size != 8) return stbi__errpuc("bad format","packet isn't 8bpp"); - } while (chained); + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (reading packets)"); + if (packet->size != 8) + return stbi__errpuc("bad format", "packet isn't 8bpp"); + } while (chained); - *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel? + *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel? 
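Each PIC packet read above carries a 4-bit channel mask, and stbi__readval/stbi__copyval walk that mask from 0x80 down to 0x10 so only the RGBA slots whose bit is set are read or written. A standalone sketch of the masked read follows; read_masked is an illustrative name, and the bytes come from a plain array rather than an stbi__context.

#include <stdio.h>

/* Read one pixel's worth of data for the channels selected by `mask`
 * (0x80 = R, 0x40 = G, 0x20 = B, 0x10 = A), leaving other slots untouched,
 * in the same spirit as stbi__readval in the PIC loader. */
static const unsigned char *read_masked(const unsigned char *src, int mask,
                                        unsigned char dest[4]) {
    for (int i = 0, bit = 0x80; i < 4; ++i, bit >>= 1)
        if (mask & bit)
            dest[i] = *src++;
    return src;  /* advanced past only the bytes actually consumed */
}

int main(void) {
    const unsigned char data[] = { 200, 100, 50 };  /* R, G, B for an RGB-only packet */
    unsigned char px[4] = { 0, 0, 0, 255 };         /* alpha pre-filled */
    read_masked(data, 0x80 | 0x40 | 0x20, px);
    printf("%d %d %d %d\n", px[0], px[1], px[2], px[3]);  /* 200 100 50 255 */
    return 0;
}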
- for(y=0; ytype) { + switch (packet->type) { default: - return stbi__errpuc("bad format","packet has bad compression type"); + return stbi__errpuc("bad format", "packet has bad compression type"); - case 0: {//uncompressed - int x; + case 0: { // uncompressed + int x; - for(x=0;xchannel,dest)) - return 0; - break; + for (x = 0; x < width; ++x, dest += 4) + if (!stbi__readval(s, packet->channel, dest)) + return 0; + break; } - case 1://Pure RLE - { - int left=width, i; + case 1: // Pure RLE + { + int left = width, i; - while (left>0) { - stbi_uc count,value[4]; + while (left > 0) { + stbi_uc count, value[4]; - count=stbi__get8(s); - if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (pure read count)"); + count = stbi__get8(s); + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (pure read count)"); - if (count > left) - count = (stbi_uc) left; + if (count > left) + count = (stbi_uc)left; - if (!stbi__readval(s,packet->channel,value)) return 0; - - for(i=0; ichannel,dest,value); - left -= count; - } - } - break; - - case 2: {//Mixed RLE - int left=width; - while (left>0) { - int count = stbi__get8(s), i; - if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (mixed read count)"); - - if (count >= 128) { // Repeated - stbi_uc value[4]; - - if (count==128) - count = stbi__get16be(s); - else - count -= 127; - if (count > left) - return stbi__errpuc("bad file","scanline overrun"); - - if (!stbi__readval(s,packet->channel,value)) + if (!stbi__readval(s, packet->channel, value)) return 0; - for(i=0;ichannel,dest,value); - } else { // Raw - ++count; - if (count>left) return stbi__errpuc("bad file","scanline overrun"); + for (i = 0; i < count; ++i, dest += 4) + stbi__copyval(packet->channel, dest, value); + left -= count; + } + } break; - for(i=0;ichannel,dest)) - return 0; - } - left-=count; - } - break; + case 2: { // Mixed RLE + int left = width; + while (left > 0) { + int count = stbi__get8(s), i; + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (mixed read count)"); + + if (count >= 128) { // Repeated + stbi_uc value[4]; + + if (count == 128) + count = stbi__get16be(s); + else + count -= 127; + if (count > left) + return stbi__errpuc("bad file", "scanline overrun"); + + if (!stbi__readval(s, packet->channel, value)) + return 0; + + for (i = 0; i < count; ++i, dest += 4) + stbi__copyval(packet->channel, dest, value); + } else { // Raw + ++count; + if (count > left) + return stbi__errpuc("bad file", "scanline overrun"); + + for (i = 0; i < count; ++i, dest += 4) + if (!stbi__readval(s, packet->channel, dest)) + return 0; + } + left -= count; + } + break; } - } - } - } + } + } + } - return result; + return result; } -static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri) -{ - stbi_uc *result; - int i, x,y, internal_comp; - STBI_NOTUSED(ri); +static void * stbi__pic_load(stbi__context * s, int * px, int * py, int * comp, int req_comp, stbi__result_info * ri) { + stbi_uc * result; + int i, x, y, internal_comp; + STBI_NOTUSED(ri); - if (!comp) comp = &internal_comp; + if (!comp) + comp = &internal_comp; - for (i=0; i<92; ++i) - stbi__get8(s); + for (i = 0; i < 92; ++i) + stbi__get8(s); - x = stbi__get16be(s); - y = stbi__get16be(s); + x = stbi__get16be(s); + y = stbi__get16be(s); - if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); - if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if 
(y > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + if (x > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); - if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (pic header)"); - if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode"); + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (pic header)"); + if (!stbi__mad3sizes_valid(x, y, 4, 0)) + return stbi__errpuc("too large", "PIC image too large to decode"); - stbi__get32be(s); //skip `ratio' - stbi__get16be(s); //skip `fields' - stbi__get16be(s); //skip `pad' + stbi__get32be(s); // skip `ratio' + stbi__get16be(s); // skip `fields' + stbi__get16be(s); // skip `pad' - // intermediate buffer is RGBA - result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0); - if (!result) return stbi__errpuc("outofmem", "Out of memory"); - memset(result, 0xff, x*y*4); + // intermediate buffer is RGBA + result = (stbi_uc *)stbi__malloc_mad3(x, y, 4, 0); + if (!result) + return stbi__errpuc("outofmem", "Out of memory"); + memset(result, 0xff, x * y * 4); - if (!stbi__pic_load_core(s,x,y,comp, result)) { - STBI_FREE(result); - result=0; - } - *px = x; - *py = y; - if (req_comp == 0) req_comp = *comp; - result=stbi__convert_format(result,4,req_comp,x,y); + if (!stbi__pic_load_core(s, x, y, comp, result)) { + STBI_FREE(result); + result = 0; + } + *px = x; + *py = y; + if (req_comp == 0) + req_comp = *comp; + result = stbi__convert_format(result, 4, req_comp, x, y); - return result; + return result; } -static int stbi__pic_test(stbi__context *s) -{ - int r = stbi__pic_test_core(s); - stbi__rewind(s); - return r; +static int stbi__pic_test(stbi__context * s) { + int r = stbi__pic_test_core(s); + stbi__rewind(s); + return r; } #endif @@ -6550,931 +6926,968 @@ static int stbi__pic_test(stbi__context *s) // GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb #ifndef STBI_NO_GIF -typedef struct -{ - stbi__int16 prefix; - stbi_uc first; - stbi_uc suffix; +typedef struct { + stbi__int16 prefix; + stbi_uc first; + stbi_uc suffix; } stbi__gif_lzw; -typedef struct -{ - int w,h; - stbi_uc *out; // output buffer (always 4 components) - stbi_uc *background; // The current "background" as far as a gif is concerned - stbi_uc *history; - int flags, bgindex, ratio, transparent, eflags; - stbi_uc pal[256][4]; - stbi_uc lpal[256][4]; - stbi__gif_lzw codes[8192]; - stbi_uc *color_table; - int parse, step; - int lflags; - int start_x, start_y; - int max_x, max_y; - int cur_x, cur_y; - int line_size; - int delay; +typedef struct { + int w, h; + stbi_uc * out; // output buffer (always 4 components) + stbi_uc * background; // The current "background" as far as a gif is concerned + stbi_uc * history; + int flags, bgindex, ratio, transparent, eflags; + stbi_uc pal[256][4]; + stbi_uc lpal[256][4]; + stbi__gif_lzw codes[8192]; + stbi_uc * color_table; + int parse, step; + int lflags; + int start_x, start_y; + int max_x, max_y; + int cur_x, cur_y; + int line_size; + int delay; } stbi__gif; -static int stbi__gif_test_raw(stbi__context *s) -{ - int sz; - if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0; - sz = stbi__get8(s); - if (sz != '9' && sz != '7') return 0; - if (stbi__get8(s) != 'a') return 0; - return 1; +static int stbi__gif_test_raw(stbi__context * s) { + int sz; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 
'F' || stbi__get8(s) != '8') + return 0; + sz = stbi__get8(s); + if (sz != '9' && sz != '7') + return 0; + if (stbi__get8(s) != 'a') + return 0; + return 1; } -static int stbi__gif_test(stbi__context *s) -{ - int r = stbi__gif_test_raw(s); - stbi__rewind(s); - return r; +static int stbi__gif_test(stbi__context * s) { + int r = stbi__gif_test_raw(s); + stbi__rewind(s); + return r; } -static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp) -{ - int i; - for (i=0; i < num_entries; ++i) { - pal[i][2] = stbi__get8(s); - pal[i][1] = stbi__get8(s); - pal[i][0] = stbi__get8(s); - pal[i][3] = transp == i ? 0 : 255; - } +static void stbi__gif_parse_colortable(stbi__context * s, stbi_uc pal[256][4], int num_entries, int transp) { + int i; + for (i = 0; i < num_entries; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + pal[i][3] = transp == i ? 0 : 255; + } } -static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info) -{ - stbi_uc version; - if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') - return stbi__err("not GIF", "Corrupt GIF"); +static int stbi__gif_header(stbi__context * s, stbi__gif * g, int * comp, int is_info) { + stbi_uc version; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + return stbi__err("not GIF", "Corrupt GIF"); - version = stbi__get8(s); - if (version != '7' && version != '9') return stbi__err("not GIF", "Corrupt GIF"); - if (stbi__get8(s) != 'a') return stbi__err("not GIF", "Corrupt GIF"); + version = stbi__get8(s); + if (version != '7' && version != '9') + return stbi__err("not GIF", "Corrupt GIF"); + if (stbi__get8(s) != 'a') + return stbi__err("not GIF", "Corrupt GIF"); - stbi__g_failure_reason = ""; - g->w = stbi__get16le(s); - g->h = stbi__get16le(s); - g->flags = stbi__get8(s); - g->bgindex = stbi__get8(s); - g->ratio = stbi__get8(s); - g->transparent = -1; + stbi__g_failure_reason = ""; + g->w = stbi__get16le(s); + g->h = stbi__get16le(s); + g->flags = stbi__get8(s); + g->bgindex = stbi__get8(s); + g->ratio = stbi__get8(s); + g->transparent = -1; - if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); - if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (g->w > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + if (g->h > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); - if (comp != 0) *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + if (comp != 0) + *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments - if (is_info) return 1; + if (is_info) + return 1; - if (g->flags & 0x80) - stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1); + if (g->flags & 0x80) + stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1); - return 1; + return 1; } -static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) -{ - stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); - if (!g) return stbi__err("outofmem", "Out of memory"); - if (!stbi__gif_header(s, g, comp, 1)) { - STBI_FREE(g); - stbi__rewind( s ); - return 0; - } - if (x) *x = g->w; - if (y) *y = g->h; - STBI_FREE(g); - return 1; +static int stbi__gif_info_raw(stbi__context * s, int * x, int * y, int * comp) { + stbi__gif * g = (stbi__gif 
*)stbi__malloc(sizeof(stbi__gif)); + if (!g) + return stbi__err("outofmem", "Out of memory"); + if (!stbi__gif_header(s, g, comp, 1)) { + STBI_FREE(g); + stbi__rewind(s); + return 0; + } + if (x) + *x = g->w; + if (y) + *y = g->h; + STBI_FREE(g); + return 1; } -static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code) -{ - stbi_uc *p, *c; - int idx; +static void stbi__out_gif_code(stbi__gif * g, stbi__uint16 code) { + stbi_uc *p, *c; + int idx; - // recurse to decode the prefixes, since the linked-list is backwards, - // and working backwards through an interleaved image would be nasty - if (g->codes[code].prefix >= 0) - stbi__out_gif_code(g, g->codes[code].prefix); + // recurse to decode the prefixes, since the linked-list is backwards, + // and working backwards through an interleaved image would be nasty + if (g->codes[code].prefix >= 0) + stbi__out_gif_code(g, g->codes[code].prefix); - if (g->cur_y >= g->max_y) return; + if (g->cur_y >= g->max_y) + return; - idx = g->cur_x + g->cur_y; - p = &g->out[idx]; - g->history[idx / 4] = 1; + idx = g->cur_x + g->cur_y; + p = &g->out[idx]; + g->history[idx / 4] = 1; - c = &g->color_table[g->codes[code].suffix * 4]; - if (c[3] > 128) { // don't render transparent pixels; - p[0] = c[2]; - p[1] = c[1]; - p[2] = c[0]; - p[3] = c[3]; - } - g->cur_x += 4; + c = &g->color_table[g->codes[code].suffix * 4]; + if (c[3] > 128) { // don't render transparent pixels; + p[0] = c[2]; + p[1] = c[1]; + p[2] = c[0]; + p[3] = c[3]; + } + g->cur_x += 4; - if (g->cur_x >= g->max_x) { - g->cur_x = g->start_x; - g->cur_y += g->step; + if (g->cur_x >= g->max_x) { + g->cur_x = g->start_x; + g->cur_y += g->step; - while (g->cur_y >= g->max_y && g->parse > 0) { - g->step = (1 << g->parse) * g->line_size; - g->cur_y = g->start_y + (g->step >> 1); - --g->parse; - } - } + while (g->cur_y >= g->max_y && g->parse > 0) { + g->step = (1 << g->parse) * g->line_size; + g->cur_y = g->start_y + (g->step >> 1); + --g->parse; + } + } } -static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g) -{ - stbi_uc lzw_cs; - stbi__int32 len, init_code; - stbi__uint32 first; - stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; - stbi__gif_lzw *p; +static stbi_uc * stbi__process_gif_raster(stbi__context * s, stbi__gif * g) { + stbi_uc lzw_cs; + stbi__int32 len, init_code; + stbi__uint32 first; + stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; + stbi__gif_lzw * p; - lzw_cs = stbi__get8(s); - if (lzw_cs > 12) return NULL; - clear = 1 << lzw_cs; - first = 1; - codesize = lzw_cs + 1; - codemask = (1 << codesize) - 1; - bits = 0; - valid_bits = 0; - for (init_code = 0; init_code < clear; init_code++) { - g->codes[init_code].prefix = -1; - g->codes[init_code].first = (stbi_uc) init_code; - g->codes[init_code].suffix = (stbi_uc) init_code; - } + lzw_cs = stbi__get8(s); + if (lzw_cs > 12) + return NULL; + clear = 1 << lzw_cs; + first = 1; + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + bits = 0; + valid_bits = 0; + for (init_code = 0; init_code < clear; init_code++) { + g->codes[init_code].prefix = -1; + g->codes[init_code].first = (stbi_uc)init_code; + g->codes[init_code].suffix = (stbi_uc)init_code; + } - // support no starting clear code - avail = clear+2; - oldcode = -1; + // support no starting clear code + avail = clear + 2; + oldcode = -1; - len = 0; - for(;;) { - if (valid_bits < codesize) { - if (len == 0) { - len = stbi__get8(s); // start new block - if (len == 0) - return g->out; - } - --len; - bits |= 
(stbi__int32) stbi__get8(s) << valid_bits; - valid_bits += 8; - } else { - stbi__int32 code = bits & codemask; - bits >>= codesize; - valid_bits -= codesize; - // @OPTIMIZE: is there some way we can accelerate the non-clear path? - if (code == clear) { // clear code - codesize = lzw_cs + 1; - codemask = (1 << codesize) - 1; - avail = clear + 2; - oldcode = -1; - first = 0; - } else if (code == clear + 1) { // end of stream code - stbi__skip(s, len); - while ((len = stbi__get8(s)) > 0) - stbi__skip(s,len); - return g->out; - } else if (code <= avail) { - if (first) { - return stbi__errpuc("no clear code", "Corrupt GIF"); + len = 0; + for (;;) { + if (valid_bits < codesize) { + if (len == 0) { + len = stbi__get8(s); // start new block + if (len == 0) + return g->out; } + --len; + bits |= (stbi__int32)stbi__get8(s) << valid_bits; + valid_bits += 8; + } else { + stbi__int32 code = bits & codemask; + bits >>= codesize; + valid_bits -= codesize; + // @OPTIMIZE: is there some way we can accelerate the non-clear path? + if (code == clear) { // clear code + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + avail = clear + 2; + oldcode = -1; + first = 0; + } else if (code == clear + 1) { // end of stream code + stbi__skip(s, len); + while ((len = stbi__get8(s)) > 0) + stbi__skip(s, len); + return g->out; + } else if (code <= avail) { + if (first) { + return stbi__errpuc("no clear code", "Corrupt GIF"); + } - if (oldcode >= 0) { - p = &g->codes[avail++]; - if (avail > 8192) { - return stbi__errpuc("too many codes", "Corrupt GIF"); - } + if (oldcode >= 0) { + p = &g->codes[avail++]; + if (avail > 8192) { + return stbi__errpuc("too many codes", "Corrupt GIF"); + } - p->prefix = (stbi__int16) oldcode; - p->first = g->codes[oldcode].first; - p->suffix = (code == avail) ? p->first : g->codes[code].first; - } else if (code == avail) - return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + p->prefix = (stbi__int16)oldcode; + p->first = g->codes[oldcode].first; + p->suffix = (code == avail) ? 
p->first : g->codes[code].first; + } else if (code == avail) + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); - stbi__out_gif_code(g, (stbi__uint16) code); + stbi__out_gif_code(g, (stbi__uint16)code); - if ((avail & codemask) == 0 && avail <= 0x0FFF) { - codesize++; - codemask = (1 << codesize) - 1; + if ((avail & codemask) == 0 && avail <= 0x0FFF) { + codesize++; + codemask = (1 << codesize) - 1; + } + + oldcode = code; + } else { + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); } - - oldcode = code; - } else { - return stbi__errpuc("illegal code in raster", "Corrupt GIF"); - } - } - } + } + } } // this function is designed to support animated gifs, although stb_image doesn't support it // two back is the image from two frames ago, used for a very specific disposal format -static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back) -{ - int dispose; - int first_frame; - int pi; - int pcount; - STBI_NOTUSED(req_comp); +static stbi_uc * stbi__gif_load_next(stbi__context * s, stbi__gif * g, int * comp, int req_comp, stbi_uc * two_back) { + int dispose; + int first_frame; + int pi; + int pcount; + STBI_NOTUSED(req_comp); - // on first frame, any non-written pixels get the background colour (non-transparent) - first_frame = 0; - if (g->out == 0) { - if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header - if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) - return stbi__errpuc("too large", "GIF image is too large"); - pcount = g->w * g->h; - g->out = (stbi_uc *) stbi__malloc(4 * pcount); - g->background = (stbi_uc *) stbi__malloc(4 * pcount); - g->history = (stbi_uc *) stbi__malloc(pcount); - if (!g->out || !g->background || !g->history) - return stbi__errpuc("outofmem", "Out of memory"); + // on first frame, any non-written pixels get the background colour (non-transparent) + first_frame = 0; + if (g->out == 0) { + if (!stbi__gif_header(s, g, comp, 0)) + return 0; // stbi__g_failure_reason set by stbi__gif_header + if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) + return stbi__errpuc("too large", "GIF image is too large"); + pcount = g->w * g->h; + g->out = (stbi_uc *)stbi__malloc(4 * pcount); + g->background = (stbi_uc *)stbi__malloc(4 * pcount); + g->history = (stbi_uc *)stbi__malloc(pcount); + if (!g->out || !g->background || !g->history) + return stbi__errpuc("outofmem", "Out of memory"); - // image is treated as "transparent" at the start - ie, nothing overwrites the current background; - // background colour is only used for pixels that are not rendered first frame, after that "background" - // color refers to the color that was there the previous frame. - memset(g->out, 0x00, 4 * pcount); - memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent) - memset(g->history, 0x00, pcount); // pixels that were affected previous frame - first_frame = 1; - } else { - // second frame - how do we dispose of the previous one? - dispose = (g->eflags & 0x1C) >> 2; - pcount = g->w * g->h; + // image is treated as "transparent" at the start - ie, nothing overwrites the current background; + // background colour is only used for pixels that are not rendered first frame, after that "background" + // color refers to the color that was there the previous frame. 
+ memset(g->out, 0x00, 4 * pcount); + memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent) + memset(g->history, 0x00, pcount); // pixels that were affected previous frame + first_frame = 1; + } else { + // second frame - how do we dispose of the previous one? + dispose = (g->eflags & 0x1C) >> 2; + pcount = g->w * g->h; - if ((dispose == 3) && (two_back == 0)) { - dispose = 2; // if I don't have an image to revert back to, default to the old background - } + if ((dispose == 3) && (two_back == 0)) { + dispose = 2; // if I don't have an image to revert back to, default to the old background + } - if (dispose == 3) { // use previous graphic - for (pi = 0; pi < pcount; ++pi) { - if (g->history[pi]) { - memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 ); + if (dispose == 3) { // use previous graphic + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy(&g->out[pi * 4], &two_back[pi * 4], 4); + } } - } - } else if (dispose == 2) { - // restore what was changed last frame to background before that frame; - for (pi = 0; pi < pcount; ++pi) { - if (g->history[pi]) { - memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 ); + } else if (dispose == 2) { + // restore what was changed last frame to background before that frame; + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy(&g->out[pi * 4], &g->background[pi * 4], 4); + } } - } - } else { - // This is a non-disposal case eithe way, so just - // leave the pixels as is, and they will become the new background - // 1: do not dispose - // 0: not specified. - } + } else { + // This is a non-disposal case eithe way, so just + // leave the pixels as is, and they will become the new background + // 1: do not dispose + // 0: not specified. + } - // background is what out is after the undoing of the previou frame; - memcpy( g->background, g->out, 4 * g->w * g->h ); - } + // background is what out is after the undoing of the previou frame; + memcpy(g->background, g->out, 4 * g->w * g->h); + } - // clear my history; - memset( g->history, 0x00, g->w * g->h ); // pixels that were affected previous frame + // clear my history; + memset(g->history, 0x00, g->w * g->h); // pixels that were affected previous frame - for (;;) { - int tag = stbi__get8(s); - switch (tag) { - case 0x2C: /* Image Descriptor */ - { + for (;;) { + int tag = stbi__get8(s); + switch (tag) { + case 0x2C: /* Image Descriptor */ + { stbi__int32 x, y, w, h; - stbi_uc *o; + stbi_uc * o; x = stbi__get16le(s); y = stbi__get16le(s); w = stbi__get16le(s); h = stbi__get16le(s); if (((x + w) > (g->w)) || ((y + h) > (g->h))) - return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); + return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); g->line_size = g->w * 4; g->start_x = x * 4; g->start_y = y * g->line_size; - g->max_x = g->start_x + w * 4; - g->max_y = g->start_y + h * g->line_size; - g->cur_x = g->start_x; - g->cur_y = g->start_y; + g->max_x = g->start_x + w * 4; + g->max_y = g->start_y + h * g->line_size; + g->cur_x = g->start_x; + g->cur_y = g->start_y; // if the width of the specified rectangle is 0, that means // we may not see *any* pixels or the image is malformed; // to make sure this is caught, move the current y down to // max_y (which is what out_gif_code checks). 
if (w == 0) - g->cur_y = g->max_y; + g->cur_y = g->max_y; g->lflags = stbi__get8(s); if (g->lflags & 0x40) { - g->step = 8 * g->line_size; // first interlaced spacing - g->parse = 3; + g->step = 8 * g->line_size; // first interlaced spacing + g->parse = 3; } else { - g->step = g->line_size; - g->parse = 0; + g->step = g->line_size; + g->parse = 0; } if (g->lflags & 0x80) { - stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1); - g->color_table = (stbi_uc *) g->lpal; + stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1); + g->color_table = (stbi_uc *)g->lpal; } else if (g->flags & 0x80) { - g->color_table = (stbi_uc *) g->pal; + g->color_table = (stbi_uc *)g->pal; } else - return stbi__errpuc("missing color table", "Corrupt GIF"); + return stbi__errpuc("missing color table", "Corrupt GIF"); o = stbi__process_gif_raster(s, g); - if (!o) return NULL; + if (!o) + return NULL; // if this was the first frame, pcount = g->w * g->h; if (first_frame && (g->bgindex > 0)) { - // if first frame, any pixel not drawn to gets the background color - for (pi = 0; pi < pcount; ++pi) { - if (g->history[pi] == 0) { - g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; - memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 ); - } - } + // if first frame, any pixel not drawn to gets the background color + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi] == 0) { + g->pal[g->bgindex][3] = + 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; + memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4); + } + } } return o; - } + } - case 0x21: // Comment Extension. - { + case 0x21: // Comment Extension. + { int len; int ext = stbi__get8(s); if (ext == 0xF9) { // Graphic Control Extension. - len = stbi__get8(s); - if (len == 4) { - g->eflags = stbi__get8(s); - g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. + len = stbi__get8(s); + if (len == 4) { + g->eflags = stbi__get8(s); + g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. 
- // unset old transparent - if (g->transparent >= 0) { - g->pal[g->transparent][3] = 255; - } - if (g->eflags & 0x01) { - g->transparent = stbi__get8(s); - if (g->transparent >= 0) { - g->pal[g->transparent][3] = 0; - } - } else { - // don't need transparent - stbi__skip(s, 1); - g->transparent = -1; - } - } else { - stbi__skip(s, len); - break; - } + // unset old transparent + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 255; + } + if (g->eflags & 0x01) { + g->transparent = stbi__get8(s); + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 0; + } + } else { + // don't need transparent + stbi__skip(s, 1); + g->transparent = -1; + } + } else { + stbi__skip(s, len); + break; + } } while ((len = stbi__get8(s)) != 0) { - stbi__skip(s, len); + stbi__skip(s, len); } break; - } + } - case 0x3B: // gif stream termination code - return (stbi_uc *) s; // using '1' causes warning on some compilers + case 0x3B: // gif stream termination code + return (stbi_uc *)s; // using '1' causes warning on some compilers - default: + default: return stbi__errpuc("unknown code", "Corrupt GIF"); - } - } + } + } } -static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays) -{ - STBI_FREE(g->out); - STBI_FREE(g->history); - STBI_FREE(g->background); +static void * stbi__load_gif_main_outofmem(stbi__gif * g, stbi_uc * out, int ** delays) { + STBI_FREE(g->out); + STBI_FREE(g->history); + STBI_FREE(g->background); - if (out) STBI_FREE(out); - if (delays && *delays) STBI_FREE(*delays); - return stbi__errpuc("outofmem", "Out of memory"); + if (out) + STBI_FREE(out); + if (delays && *delays) + STBI_FREE(*delays); + return stbi__errpuc("outofmem", "Out of memory"); } -static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp) -{ - if (stbi__gif_test(s)) { - int layers = 0; - stbi_uc *u = 0; - stbi_uc *out = 0; - stbi_uc *two_back = 0; - stbi__gif g; - int stride; - int out_size = 0; - int delays_size = 0; +static void * stbi__load_gif_main(stbi__context * s, int ** delays, int * x, int * y, int * z, int * comp, int req_comp) { + if (stbi__gif_test(s)) { + int layers = 0; + stbi_uc * u = 0; + stbi_uc * out = 0; + stbi_uc * two_back = 0; + stbi__gif g; + int stride; + int out_size = 0; + int delays_size = 0; - STBI_NOTUSED(out_size); - STBI_NOTUSED(delays_size); + STBI_NOTUSED(out_size); + STBI_NOTUSED(delays_size); - memset(&g, 0, sizeof(g)); - if (delays) { - *delays = 0; - } + memset(&g, 0, sizeof(g)); + if (delays) { + *delays = 0; + } - do { - u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); - if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + do { + u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); + if (u == (stbi_uc *)s) + u = 0; // end of animated gif marker - if (u) { - *x = g.w; - *y = g.h; - ++layers; - stride = g.w * g.h * 4; + if (u) { + *x = g.w; + *y = g.h; + ++layers; + stride = g.w * g.h * 4; - if (out) { - void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride ); - if (!tmp) - return stbi__load_gif_main_outofmem(&g, out, delays); - else { - out = (stbi_uc*) tmp; - out_size = layers * stride; - } + if (out) { + void * tmp = (stbi_uc *)STBI_REALLOC_SIZED(out, out_size, layers * stride); + if (!tmp) + return stbi__load_gif_main_outofmem(&g, out, delays); + else { + out = (stbi_uc *)tmp; + out_size = layers * stride; + } - if (delays) { - int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers ); - if (!new_delays) - return 
stbi__load_gif_main_outofmem(&g, out, delays); - *delays = new_delays; - delays_size = layers * sizeof(int); - } - } else { - out = (stbi_uc*)stbi__malloc( layers * stride ); - if (!out) - return stbi__load_gif_main_outofmem(&g, out, delays); - out_size = layers * stride; - if (delays) { - *delays = (int*) stbi__malloc( layers * sizeof(int) ); - if (!*delays) - return stbi__load_gif_main_outofmem(&g, out, delays); - delays_size = layers * sizeof(int); - } - } - memcpy( out + ((layers - 1) * stride), u, stride ); - if (layers >= 2) { - two_back = out - 2 * stride; + if (delays) { + int * new_delays = (int *)STBI_REALLOC_SIZED(*delays, delays_size, sizeof(int) * layers); + if (!new_delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + *delays = new_delays; + delays_size = layers * sizeof(int); + } + } else { + out = (stbi_uc *)stbi__malloc(layers * stride); + if (!out) + return stbi__load_gif_main_outofmem(&g, out, delays); + out_size = layers * stride; + if (delays) { + *delays = (int *)stbi__malloc(layers * sizeof(int)); + if (!*delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + delays_size = layers * sizeof(int); + } + } + memcpy(out + ((layers - 1) * stride), u, stride); + if (layers >= 2) { + two_back = out - 2 * stride; + } + + if (delays) { + (*delays)[layers - 1U] = g.delay; + } } + } while (u != 0); - if (delays) { - (*delays)[layers - 1U] = g.delay; - } - } - } while (u != 0); + // free temp buffer; + STBI_FREE(g.out); + STBI_FREE(g.history); + STBI_FREE(g.background); - // free temp buffer; - STBI_FREE(g.out); - STBI_FREE(g.history); - STBI_FREE(g.background); + // do the final conversion after loading everything; + if (req_comp && req_comp != 4) + out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); - // do the final conversion after loading everything; - if (req_comp && req_comp != 4) - out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); - - *z = layers; - return out; - } else { - return stbi__errpuc("not GIF", "Image was not as a gif type."); - } + *z = layers; + return out; + } else { + return stbi__errpuc("not GIF", "Image was not as a gif type."); + } } -static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) -{ - stbi_uc *u = 0; - stbi__gif g; - memset(&g, 0, sizeof(g)); - STBI_NOTUSED(ri); +static void * stbi__gif_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + stbi_uc * u = 0; + stbi__gif g; + memset(&g, 0, sizeof(g)); + STBI_NOTUSED(ri); - u = stbi__gif_load_next(s, &g, comp, req_comp, 0); - if (u == (stbi_uc *) s) u = 0; // end of animated gif marker - if (u) { - *x = g.w; - *y = g.h; + u = stbi__gif_load_next(s, &g, comp, req_comp, 0); + if (u == (stbi_uc *)s) + u = 0; // end of animated gif marker + if (u) { + *x = g.w; + *y = g.h; - // moved conversion to after successful load so that the same - // can be done for multiple frames. - if (req_comp && req_comp != 4) - u = stbi__convert_format(u, 4, req_comp, g.w, g.h); - } else if (g.out) { - // if there was an error and we allocated an image buffer, free it! - STBI_FREE(g.out); - } + // moved conversion to after successful load so that the same + // can be done for multiple frames. + if (req_comp && req_comp != 4) + u = stbi__convert_format(u, 4, req_comp, g.w, g.h); + } else if (g.out) { + // if there was an error and we allocated an image buffer, free it! 
+ STBI_FREE(g.out); + } - // free buffers needed for multiple frame loading; - STBI_FREE(g.history); - STBI_FREE(g.background); + // free buffers needed for multiple frame loading; + STBI_FREE(g.history); + STBI_FREE(g.background); - return u; + return u; } -static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) -{ - return stbi__gif_info_raw(s,x,y,comp); -} +static int stbi__gif_info(stbi__context * s, int * x, int * y, int * comp) { return stbi__gif_info_raw(s, x, y, comp); } #endif // ************************************************************************************************* // Radiance RGBE HDR loader // originally by Nicolas Schulz #ifndef STBI_NO_HDR -static int stbi__hdr_test_core(stbi__context *s, const char *signature) -{ - int i; - for (i=0; signature[i]; ++i) - if (stbi__get8(s) != signature[i]) - return 0; - stbi__rewind(s); - return 1; +static int stbi__hdr_test_core(stbi__context * s, const char * signature) { + int i; + for (i = 0; signature[i]; ++i) + if (stbi__get8(s) != signature[i]) + return 0; + stbi__rewind(s); + return 1; } -static int stbi__hdr_test(stbi__context* s) -{ - int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); - stbi__rewind(s); - if(!r) { - r = stbi__hdr_test_core(s, "#?RGBE\n"); - stbi__rewind(s); - } - return r; +static int stbi__hdr_test(stbi__context * s) { + int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); + stbi__rewind(s); + if (!r) { + r = stbi__hdr_test_core(s, "#?RGBE\n"); + stbi__rewind(s); + } + return r; } -#define STBI__HDR_BUFLEN 1024 -static char *stbi__hdr_gettoken(stbi__context *z, char *buffer) -{ - int len=0; - char c = '\0'; +#define STBI__HDR_BUFLEN 1024 +static char * stbi__hdr_gettoken(stbi__context * z, char * buffer) { + int len = 0; + char c = '\0'; - c = (char) stbi__get8(z); + c = (char)stbi__get8(z); - while (!stbi__at_eof(z) && c != '\n') { - buffer[len++] = c; - if (len == STBI__HDR_BUFLEN-1) { - // flush to end of line - while (!stbi__at_eof(z) && stbi__get8(z) != '\n') - ; - break; - } - c = (char) stbi__get8(z); - } + while (!stbi__at_eof(z) && c != '\n') { + buffer[len++] = c; + if (len == STBI__HDR_BUFLEN - 1) { + // flush to end of line + while (!stbi__at_eof(z) && stbi__get8(z) != '\n') + ; + break; + } + c = (char)stbi__get8(z); + } - buffer[len] = 0; - return buffer; + buffer[len] = 0; + return buffer; } -static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp) -{ - if ( input[3] != 0 ) { - float f1; - // Exponent - f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8)); - if (req_comp <= 2) - output[0] = (input[0] + input[1] + input[2]) * f1 / 3; - else { - output[0] = input[0] * f1; - output[1] = input[1] * f1; - output[2] = input[2] * f1; - } - if (req_comp == 2) output[1] = 1; - if (req_comp == 4) output[3] = 1; - } else { - switch (req_comp) { - case 4: output[3] = 1; /* fallthrough */ - case 3: output[0] = output[1] = output[2] = 0; - break; - case 2: output[1] = 1; /* fallthrough */ - case 1: output[0] = 0; - break; - } - } +static void stbi__hdr_convert(float * output, stbi_uc * input, int req_comp) { + if (input[3] != 0) { + float f1; + // Exponent + f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) + output[0] = (input[0] + input[1] + input[2]) * f1 / 3; + else { + output[0] = input[0] * f1; + output[1] = input[1] * f1; + output[2] = input[2] * f1; + } + if (req_comp == 2) + output[1] = 1; + if (req_comp == 4) + output[3] = 1; + } else { + switch (req_comp) { + case 4: + output[3] = 1; /* fallthrough */ + case 3: + output[0] = output[1] = output[2] 
= 0; + break; + case 2: + output[1] = 1; /* fallthrough */ + case 1: + output[0] = 0; + break; + } + } } -static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) -{ - char buffer[STBI__HDR_BUFLEN]; - char *token; - int valid = 0; - int width, height; - stbi_uc *scanline; - float *hdr_data; - int len; - unsigned char count, value; - int i, j, k, c1,c2, z; - const char *headerToken; - STBI_NOTUSED(ri); +static float * stbi__hdr_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + char buffer[STBI__HDR_BUFLEN]; + char * token; + int valid = 0; + int width, height; + stbi_uc * scanline; + float * hdr_data; + int len; + unsigned char count, value; + int i, j, k, c1, c2, z; + const char * headerToken; + STBI_NOTUSED(ri); - // Check identifier - headerToken = stbi__hdr_gettoken(s,buffer); - if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) - return stbi__errpf("not HDR", "Corrupt HDR image"); + // Check identifier + headerToken = stbi__hdr_gettoken(s, buffer); + if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) + return stbi__errpf("not HDR", "Corrupt HDR image"); - // Parse header - for(;;) { - token = stbi__hdr_gettoken(s,buffer); - if (token[0] == 0) break; - if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; - } + // Parse header + for (;;) { + token = stbi__hdr_gettoken(s, buffer); + if (token[0] == 0) + break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) + valid = 1; + } - if (!valid) return stbi__errpf("unsupported format", "Unsupported HDR format"); + if (!valid) + return stbi__errpf("unsupported format", "Unsupported HDR format"); - // Parse width and height - // can't use sscanf() if we're not using stdio! - token = stbi__hdr_gettoken(s,buffer); - if (strncmp(token, "-Y ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); - token += 3; - height = (int) strtol(token, &token, 10); - while (*token == ' ') ++token; - if (strncmp(token, "+X ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); - token += 3; - width = (int) strtol(token, NULL, 10); + // Parse width and height + // can't use sscanf() if we're not using stdio! 
+ token = stbi__hdr_gettoken(s, buffer); + if (strncmp(token, "-Y ", 3)) + return stbi__errpf("unsupported data layout", "Unsupported HDR format"); + token += 3; + height = (int)strtol(token, &token, 10); + while (*token == ' ') + ++token; + if (strncmp(token, "+X ", 3)) + return stbi__errpf("unsupported data layout", "Unsupported HDR format"); + token += 3; + width = (int)strtol(token, NULL, 10); - if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)"); - if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)"); + if (height > STBI_MAX_DIMENSIONS) + return stbi__errpf("too large", "Very large image (corrupt?)"); + if (width > STBI_MAX_DIMENSIONS) + return stbi__errpf("too large", "Very large image (corrupt?)"); - *x = width; - *y = height; + *x = width; + *y = height; - if (comp) *comp = 3; - if (req_comp == 0) req_comp = 3; + if (comp) + *comp = 3; + if (req_comp == 0) + req_comp = 3; - if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) - return stbi__errpf("too large", "HDR image is too large"); + if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) + return stbi__errpf("too large", "HDR image is too large"); - // Read data - hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); - if (!hdr_data) - return stbi__errpf("outofmem", "Out of memory"); + // Read data + hdr_data = (float *)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); + if (!hdr_data) + return stbi__errpf("outofmem", "Out of memory"); - // Load image data - // image data is stored as some number of sca - if ( width < 8 || width >= 32768) { - // Read flat data - for (j=0; j < height; ++j) { - for (i=0; i < width; ++i) { - stbi_uc rgbe[4]; - main_decode_loop: - stbi__getn(s, rgbe, 4); - stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp); - } - } - } else { - // Read RLE-encoded data - scanline = NULL; + // Load image data + // image data is stored as some number of sca + if (width < 8 || width >= 32768) { + // Read flat data + for (j = 0; j < height; ++j) { + for (i = 0; i < width; ++i) { + stbi_uc rgbe[4]; + main_decode_loop: + stbi__getn(s, rgbe, 4); + stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp); + } + } + } else { + // Read RLE-encoded data + scanline = NULL; - for (j = 0; j < height; ++j) { - c1 = stbi__get8(s); - c2 = stbi__get8(s); - len = stbi__get8(s); - if (c1 != 2 || c2 != 2 || (len & 0x80)) { - // not run-length encoded, so we have to actually use THIS data as a decoded - // pixel (note this can't be a valid pixel--one of RGB must be >= 128) - stbi_uc rgbe[4]; - rgbe[0] = (stbi_uc) c1; - rgbe[1] = (stbi_uc) c2; - rgbe[2] = (stbi_uc) len; - rgbe[3] = (stbi_uc) stbi__get8(s); - stbi__hdr_convert(hdr_data, rgbe, req_comp); - i = 1; - j = 0; + for (j = 0; j < height; ++j) { + c1 = stbi__get8(s); + c2 = stbi__get8(s); + len = stbi__get8(s); + if (c1 != 2 || c2 != 2 || (len & 0x80)) { + // not run-length encoded, so we have to actually use THIS data as a decoded + // pixel (note this can't be a valid pixel--one of RGB must be >= 128) + stbi_uc rgbe[4]; + rgbe[0] = (stbi_uc)c1; + rgbe[1] = (stbi_uc)c2; + rgbe[2] = (stbi_uc)len; + rgbe[3] = (stbi_uc)stbi__get8(s); + stbi__hdr_convert(hdr_data, rgbe, req_comp); + i = 1; + j = 0; + STBI_FREE(scanline); + goto main_decode_loop; // yes, this makes no sense + } + len <<= 8; + len |= stbi__get8(s); + if (len != width) { + STBI_FREE(hdr_data); + 
STBI_FREE(scanline); + return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); + } + if (scanline == NULL) { + scanline = (stbi_uc *)stbi__malloc_mad2(width, 4, 0); + if (!scanline) { + STBI_FREE(hdr_data); + return stbi__errpf("outofmem", "Out of memory"); + } + } + + for (k = 0; k < 4; ++k) { + int nleft; + i = 0; + while ((nleft = width - i) > 0) { + count = stbi__get8(s); + if (count > 128) { + // Run + value = stbi__get8(s); + count -= 128; + if ((count == 0) || (count > nleft)) { + STBI_FREE(hdr_data); + STBI_FREE(scanline); + return stbi__errpf("corrupt", "bad RLE data in HDR"); + } + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = value; + } else { + // Dump + if ((count == 0) || (count > nleft)) { + STBI_FREE(hdr_data); + STBI_FREE(scanline); + return stbi__errpf("corrupt", "bad RLE data in HDR"); + } + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = stbi__get8(s); + } + } + } + for (i = 0; i < width; ++i) + stbi__hdr_convert(hdr_data + (j * width + i) * req_comp, scanline + i * 4, req_comp); + } + if (scanline) STBI_FREE(scanline); - goto main_decode_loop; // yes, this makes no sense - } - len <<= 8; - len |= stbi__get8(s); - if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); } - if (scanline == NULL) { - scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0); - if (!scanline) { - STBI_FREE(hdr_data); - return stbi__errpf("outofmem", "Out of memory"); - } - } + } - for (k = 0; k < 4; ++k) { - int nleft; - i = 0; - while ((nleft = width - i) > 0) { - count = stbi__get8(s); - if (count > 128) { - // Run - value = stbi__get8(s); - count -= 128; - if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } - for (z = 0; z < count; ++z) - scanline[i++ * 4 + k] = value; - } else { - // Dump - if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } - for (z = 0; z < count; ++z) - scanline[i++ * 4 + k] = stbi__get8(s); - } - } - } - for (i=0; i < width; ++i) - stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp); - } - if (scanline) - STBI_FREE(scanline); - } - - return hdr_data; + return hdr_data; } -static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp) -{ - char buffer[STBI__HDR_BUFLEN]; - char *token; - int valid = 0; - int dummy; +static int stbi__hdr_info(stbi__context * s, int * x, int * y, int * comp) { + char buffer[STBI__HDR_BUFLEN]; + char * token; + int valid = 0; + int dummy; - if (!x) x = &dummy; - if (!y) y = &dummy; - if (!comp) comp = &dummy; + if (!x) + x = &dummy; + if (!y) + y = &dummy; + if (!comp) + comp = &dummy; - if (stbi__hdr_test(s) == 0) { - stbi__rewind( s ); - return 0; - } + if (stbi__hdr_test(s) == 0) { + stbi__rewind(s); + return 0; + } - for(;;) { - token = stbi__hdr_gettoken(s,buffer); - if (token[0] == 0) break; - if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; - } + for (;;) { + token = stbi__hdr_gettoken(s, buffer); + if (token[0] == 0) + break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) + valid = 1; + } - if (!valid) { - stbi__rewind( s ); - return 0; - } - token = stbi__hdr_gettoken(s,buffer); - if (strncmp(token, "-Y ", 3)) { - stbi__rewind( s ); - return 0; - } - token += 3; - *y = (int) strtol(token, &token, 10); - while (*token == ' ') ++token; - if (strncmp(token, "+X ", 3)) { - stbi__rewind( s ); - return 0; - } - token += 
3; - *x = (int) strtol(token, NULL, 10); - *comp = 3; - return 1; + if (!valid) { + stbi__rewind(s); + return 0; + } + token = stbi__hdr_gettoken(s, buffer); + if (strncmp(token, "-Y ", 3)) { + stbi__rewind(s); + return 0; + } + token += 3; + *y = (int)strtol(token, &token, 10); + while (*token == ' ') + ++token; + if (strncmp(token, "+X ", 3)) { + stbi__rewind(s); + return 0; + } + token += 3; + *x = (int)strtol(token, NULL, 10); + *comp = 3; + return 1; } #endif // STBI_NO_HDR #ifndef STBI_NO_BMP -static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) -{ - void *p; - stbi__bmp_data info; +static int stbi__bmp_info(stbi__context * s, int * x, int * y, int * comp) { + void * p; + stbi__bmp_data info; - info.all_a = 255; - p = stbi__bmp_parse_header(s, &info); - if (p == NULL) { - stbi__rewind( s ); - return 0; - } - if (x) *x = s->img_x; - if (y) *y = s->img_y; - if (comp) { - if (info.bpp == 24 && info.ma == 0xff000000) - *comp = 3; - else - *comp = info.ma ? 4 : 3; - } - return 1; + info.all_a = 255; + p = stbi__bmp_parse_header(s, &info); + if (p == NULL) { + stbi__rewind(s); + return 0; + } + if (x) + *x = s->img_x; + if (y) + *y = s->img_y; + if (comp) { + if (info.bpp == 24 && info.ma == 0xff000000) + *comp = 3; + else + *comp = info.ma ? 4 : 3; + } + return 1; } #endif #ifndef STBI_NO_PSD -static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp) -{ - int channelCount, dummy, depth; - if (!x) x = &dummy; - if (!y) y = &dummy; - if (!comp) comp = &dummy; - if (stbi__get32be(s) != 0x38425053) { - stbi__rewind( s ); - return 0; - } - if (stbi__get16be(s) != 1) { - stbi__rewind( s ); - return 0; - } - stbi__skip(s, 6); - channelCount = stbi__get16be(s); - if (channelCount < 0 || channelCount > 16) { - stbi__rewind( s ); - return 0; - } - *y = stbi__get32be(s); - *x = stbi__get32be(s); - depth = stbi__get16be(s); - if (depth != 8 && depth != 16) { - stbi__rewind( s ); - return 0; - } - if (stbi__get16be(s) != 3) { - stbi__rewind( s ); - return 0; - } - *comp = 4; - return 1; +static int stbi__psd_info(stbi__context * s, int * x, int * y, int * comp) { + int channelCount, dummy, depth; + if (!x) + x = &dummy; + if (!y) + y = &dummy; + if (!comp) + comp = &dummy; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind(s); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind(s); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind(s); + return 0; + } + *y = stbi__get32be(s); + *x = stbi__get32be(s); + depth = stbi__get16be(s); + if (depth != 8 && depth != 16) { + stbi__rewind(s); + return 0; + } + if (stbi__get16be(s) != 3) { + stbi__rewind(s); + return 0; + } + *comp = 4; + return 1; } -static int stbi__psd_is16(stbi__context *s) -{ - int channelCount, depth; - if (stbi__get32be(s) != 0x38425053) { - stbi__rewind( s ); - return 0; - } - if (stbi__get16be(s) != 1) { - stbi__rewind( s ); - return 0; - } - stbi__skip(s, 6); - channelCount = stbi__get16be(s); - if (channelCount < 0 || channelCount > 16) { - stbi__rewind( s ); - return 0; - } - STBI_NOTUSED(stbi__get32be(s)); - STBI_NOTUSED(stbi__get32be(s)); - depth = stbi__get16be(s); - if (depth != 16) { - stbi__rewind( s ); - return 0; - } - return 1; +static int stbi__psd_is16(stbi__context * s) { + int channelCount, depth; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind(s); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind(s); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if 
(channelCount < 0 || channelCount > 16) { + stbi__rewind(s); + return 0; + } + STBI_NOTUSED(stbi__get32be(s)); + STBI_NOTUSED(stbi__get32be(s)); + depth = stbi__get16be(s); + if (depth != 16) { + stbi__rewind(s); + return 0; + } + return 1; } #endif #ifndef STBI_NO_PIC -static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) -{ - int act_comp=0,num_packets=0,chained,dummy; - stbi__pic_packet packets[10]; +static int stbi__pic_info(stbi__context * s, int * x, int * y, int * comp) { + int act_comp = 0, num_packets = 0, chained, dummy; + stbi__pic_packet packets[10]; - if (!x) x = &dummy; - if (!y) y = &dummy; - if (!comp) comp = &dummy; + if (!x) + x = &dummy; + if (!y) + y = &dummy; + if (!comp) + comp = &dummy; - if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) { - stbi__rewind(s); - return 0; - } + if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) { + stbi__rewind(s); + return 0; + } - stbi__skip(s, 88); + stbi__skip(s, 88); - *x = stbi__get16be(s); - *y = stbi__get16be(s); - if (stbi__at_eof(s)) { - stbi__rewind( s); - return 0; - } - if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) { - stbi__rewind( s ); - return 0; - } + *x = stbi__get16be(s); + *y = stbi__get16be(s); + if (stbi__at_eof(s)) { + stbi__rewind(s); + return 0; + } + if ((*x) != 0 && (1 << 28) / (*x) < (*y)) { + stbi__rewind(s); + return 0; + } - stbi__skip(s, 8); + stbi__skip(s, 8); - do { - stbi__pic_packet *packet; + do { + stbi__pic_packet * packet; - if (num_packets==sizeof(packets)/sizeof(packets[0])) - return 0; + if (num_packets == sizeof(packets) / sizeof(packets[0])) + return 0; - packet = &packets[num_packets++]; - chained = stbi__get8(s); - packet->size = stbi__get8(s); - packet->type = stbi__get8(s); - packet->channel = stbi__get8(s); - act_comp |= packet->channel; + packet = &packets[num_packets++]; + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); + act_comp |= packet->channel; - if (stbi__at_eof(s)) { - stbi__rewind( s ); - return 0; - } - if (packet->size != 8) { - stbi__rewind( s ); - return 0; - } - } while (chained); + if (stbi__at_eof(s)) { + stbi__rewind(s); + return 0; + } + if (packet->size != 8) { + stbi__rewind(s); + return 0; + } + } while (chained); - *comp = (act_comp & 0x10 ? 4 : 3); + *comp = (act_comp & 0x10 ? 
4 : 3); - return 1; + return 1; } #endif @@ -7491,272 +7904,271 @@ static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) #ifndef STBI_NO_PNM -static int stbi__pnm_test(stbi__context *s) -{ - char p, t; - p = (char) stbi__get8(s); - t = (char) stbi__get8(s); - if (p != 'P' || (t != '5' && t != '6')) { - stbi__rewind( s ); - return 0; - } - return 1; +static int stbi__pnm_test(stbi__context * s) { + char p, t; + p = (char)stbi__get8(s); + t = (char)stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind(s); + return 0; + } + return 1; } -static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) -{ - stbi_uc *out; - STBI_NOTUSED(ri); +static void * stbi__pnm_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + stbi_uc * out; + STBI_NOTUSED(ri); - ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n); - if (ri->bits_per_channel == 0) - return 0; + ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n); + if (ri->bits_per_channel == 0) + return 0; - if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); - if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_y > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); - *x = s->img_x; - *y = s->img_y; - if (comp) *comp = s->img_n; + *x = s->img_x; + *y = s->img_y; + if (comp) + *comp = s->img_n; - if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0)) - return stbi__errpuc("too large", "PNM too large"); + if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0)) + return stbi__errpuc("too large", "PNM too large"); - out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0); - if (!out) return stbi__errpuc("outofmem", "Out of memory"); - if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) { - STBI_FREE(out); - return stbi__errpuc("bad PNM", "PNM file truncated"); - } + out = (stbi_uc *)stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0); + if (!out) + return stbi__errpuc("outofmem", "Out of memory"); + if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) { + STBI_FREE(out); + return stbi__errpuc("bad PNM", "PNM file truncated"); + } - if (req_comp && req_comp != s->img_n) { - if (ri->bits_per_channel == 16) { - out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y); - } else { - out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); - } - if (out == NULL) return out; // stbi__convert_format frees input on failure - } - return out; + if (req_comp && req_comp != s->img_n) { + if (ri->bits_per_channel == 16) { + out = (stbi_uc *)stbi__convert_format16((stbi__uint16 *)out, s->img_n, req_comp, s->img_x, s->img_y); + } else { + out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); + } + if (out == NULL) + return out; // stbi__convert_format frees input on failure + } + return out; } -static int stbi__pnm_isspace(char c) -{ - return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; +static int stbi__pnm_isspace(char 
c) { return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; } + +static void stbi__pnm_skip_whitespace(stbi__context * s, char * c) { + for (;;) { + while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) + *c = (char)stbi__get8(s); + + if (stbi__at_eof(s) || *c != '#') + break; + + while (!stbi__at_eof(s) && *c != '\n' && *c != '\r') + *c = (char)stbi__get8(s); + } } -static void stbi__pnm_skip_whitespace(stbi__context *s, char *c) -{ - for (;;) { - while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) - *c = (char) stbi__get8(s); +static int stbi__pnm_isdigit(char c) { return c >= '0' && c <= '9'; } - if (stbi__at_eof(s) || *c != '#') - break; +static int stbi__pnm_getinteger(stbi__context * s, char * c) { + int value = 0; - while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' ) - *c = (char) stbi__get8(s); - } + while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { + value = value * 10 + (*c - '0'); + *c = (char)stbi__get8(s); + if ((value > 214748364) || (value == 214748364 && *c > '7')) + return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int"); + } + + return value; } -static int stbi__pnm_isdigit(char c) -{ - return c >= '0' && c <= '9'; +static int stbi__pnm_info(stbi__context * s, int * x, int * y, int * comp) { + int maxv, dummy; + char c, p, t; + + if (!x) + x = &dummy; + if (!y) + y = &dummy; + if (!comp) + comp = &dummy; + + stbi__rewind(s); + + // Get identifier + p = (char)stbi__get8(s); + t = (char)stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind(s); + return 0; + } + + *comp = (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm + + c = (char)stbi__get8(s); + stbi__pnm_skip_whitespace(s, &c); + + *x = stbi__pnm_getinteger(s, &c); // read width + if (*x == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + *y = stbi__pnm_getinteger(s, &c); // read height + if (*y == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + maxv = stbi__pnm_getinteger(s, &c); // read max value + if (maxv > 65535) + return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images"); + else if (maxv > 255) + return 16; + else + return 8; } -static int stbi__pnm_getinteger(stbi__context *s, char *c) -{ - int value = 0; - - while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { - value = value*10 + (*c - '0'); - *c = (char) stbi__get8(s); - if((value > 214748364) || (value == 214748364 && *c > '7')) - return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int"); - } - - return value; -} - -static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) -{ - int maxv, dummy; - char c, p, t; - - if (!x) x = &dummy; - if (!y) y = &dummy; - if (!comp) comp = &dummy; - - stbi__rewind(s); - - // Get identifier - p = (char) stbi__get8(s); - t = (char) stbi__get8(s); - if (p != 'P' || (t != '5' && t != '6')) { - stbi__rewind(s); - return 0; - } - - *comp = (t == '6') ? 
3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm - - c = (char) stbi__get8(s); - stbi__pnm_skip_whitespace(s, &c); - - *x = stbi__pnm_getinteger(s, &c); // read width - if(*x == 0) - return stbi__err("invalid width", "PPM image header had zero or overflowing width"); - stbi__pnm_skip_whitespace(s, &c); - - *y = stbi__pnm_getinteger(s, &c); // read height - if (*y == 0) - return stbi__err("invalid width", "PPM image header had zero or overflowing width"); - stbi__pnm_skip_whitespace(s, &c); - - maxv = stbi__pnm_getinteger(s, &c); // read max value - if (maxv > 65535) - return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images"); - else if (maxv > 255) - return 16; - else - return 8; -} - -static int stbi__pnm_is16(stbi__context *s) -{ - if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) - return 1; - return 0; +static int stbi__pnm_is16(stbi__context * s) { + if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) + return 1; + return 0; } #endif -static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) -{ - #ifndef STBI_NO_JPEG - if (stbi__jpeg_info(s, x, y, comp)) return 1; - #endif +static int stbi__info_main(stbi__context * s, int * x, int * y, int * comp) { +#ifndef STBI_NO_JPEG + if (stbi__jpeg_info(s, x, y, comp)) + return 1; +#endif - #ifndef STBI_NO_PNG - if (stbi__png_info(s, x, y, comp)) return 1; - #endif +#ifndef STBI_NO_PNG + if (stbi__png_info(s, x, y, comp)) + return 1; +#endif - #ifndef STBI_NO_GIF - if (stbi__gif_info(s, x, y, comp)) return 1; - #endif +#ifndef STBI_NO_GIF + if (stbi__gif_info(s, x, y, comp)) + return 1; +#endif - #ifndef STBI_NO_BMP - if (stbi__bmp_info(s, x, y, comp)) return 1; - #endif +#ifndef STBI_NO_BMP + if (stbi__bmp_info(s, x, y, comp)) + return 1; +#endif - #ifndef STBI_NO_PSD - if (stbi__psd_info(s, x, y, comp)) return 1; - #endif +#ifndef STBI_NO_PSD + if (stbi__psd_info(s, x, y, comp)) + return 1; +#endif - #ifndef STBI_NO_PIC - if (stbi__pic_info(s, x, y, comp)) return 1; - #endif +#ifndef STBI_NO_PIC + if (stbi__pic_info(s, x, y, comp)) + return 1; +#endif - #ifndef STBI_NO_PNM - if (stbi__pnm_info(s, x, y, comp)) return 1; - #endif +#ifndef STBI_NO_PNM + if (stbi__pnm_info(s, x, y, comp)) + return 1; +#endif - #ifndef STBI_NO_HDR - if (stbi__hdr_info(s, x, y, comp)) return 1; - #endif +#ifndef STBI_NO_HDR + if (stbi__hdr_info(s, x, y, comp)) + return 1; +#endif - // test tga last because it's a crappy test! - #ifndef STBI_NO_TGA - if (stbi__tga_info(s, x, y, comp)) - return 1; - #endif - return stbi__err("unknown image type", "Image not of any known type, or corrupt"); +// test tga last because it's a crappy test! 
+#ifndef STBI_NO_TGA + if (stbi__tga_info(s, x, y, comp)) + return 1; +#endif + return stbi__err("unknown image type", "Image not of any known type, or corrupt"); } -static int stbi__is_16_main(stbi__context *s) -{ - #ifndef STBI_NO_PNG - if (stbi__png_is16(s)) return 1; - #endif +static int stbi__is_16_main(stbi__context * s) { +#ifndef STBI_NO_PNG + if (stbi__png_is16(s)) + return 1; +#endif - #ifndef STBI_NO_PSD - if (stbi__psd_is16(s)) return 1; - #endif +#ifndef STBI_NO_PSD + if (stbi__psd_is16(s)) + return 1; +#endif - #ifndef STBI_NO_PNM - if (stbi__pnm_is16(s)) return 1; - #endif - return 0; +#ifndef STBI_NO_PNM + if (stbi__pnm_is16(s)) + return 1; +#endif + return 0; } #ifndef STBI_NO_STDIO -STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp) -{ - FILE *f = stbi__fopen(filename, "rb"); +STBIDEF int stbi_info(char const * filename, int * x, int * y, int * comp) { + FILE * f = stbi__fopen(filename, "rb"); int result; - if (!f) return stbi__err("can't fopen", "Unable to open file"); + if (!f) + return stbi__err("can't fopen", "Unable to open file"); result = stbi_info_from_file(f, x, y, comp); fclose(f); return result; } -STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) -{ - int r; - stbi__context s; - long pos = ftell(f); - stbi__start_file(&s, f); - r = stbi__info_main(&s,x,y,comp); - fseek(f,pos,SEEK_SET); - return r; +STBIDEF int stbi_info_from_file(FILE * f, int * x, int * y, int * comp) { + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__info_main(&s, x, y, comp); + fseek(f, pos, SEEK_SET); + return r; } -STBIDEF int stbi_is_16_bit(char const *filename) -{ - FILE *f = stbi__fopen(filename, "rb"); +STBIDEF int stbi_is_16_bit(char const * filename) { + FILE * f = stbi__fopen(filename, "rb"); int result; - if (!f) return stbi__err("can't fopen", "Unable to open file"); + if (!f) + return stbi__err("can't fopen", "Unable to open file"); result = stbi_is_16_bit_from_file(f); fclose(f); return result; } -STBIDEF int stbi_is_16_bit_from_file(FILE *f) -{ - int r; - stbi__context s; - long pos = ftell(f); - stbi__start_file(&s, f); - r = stbi__is_16_main(&s); - fseek(f,pos,SEEK_SET); - return r; +STBIDEF int stbi_is_16_bit_from_file(FILE * f) { + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__is_16_main(&s); + fseek(f, pos, SEEK_SET); + return r; } #endif // !STBI_NO_STDIO -STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) -{ - stbi__context s; - stbi__start_mem(&s,buffer,len); - return stbi__info_main(&s,x,y,comp); +STBIDEF int stbi_info_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp) { + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__info_main(&s, x, y, comp); } -STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp) -{ - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); - return stbi__info_main(&s,x,y,comp); +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const * c, void * user, int * x, int * y, int * comp) { + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user); + return stbi__info_main(&s, x, y, comp); } -STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len) -{ - stbi__context s; - stbi__start_mem(&s,buffer,len); - return stbi__is_16_main(&s); +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const * buffer, int len) { + stbi__context s; + 
stbi__start_mem(&s, buffer, len); + return stbi__is_16_main(&s); } -STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user) -{ - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); - return stbi__is_16_main(&s); +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const * c, void * user) { + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user); + return stbi__is_16_main(&s); } #endif // STB_IMAGE_IMPLEMENTATION @@ -7867,12 +8279,9 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user 1.30 (2011-06-11) added ability to load files via callbacks to accomidate custom input streams (Ben Wenger) removed deprecated format-specific test/load functions - removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway - error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha) - fix inefficiency in decoding 32-bit BMP (David Woo) - 1.29 (2010-08-16) - various warning fixes from Aurelien Pocheville - 1.28 (2010-08-01) + removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks + anyway error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha) fix inefficiency in + decoding 32-bit BMP (David Woo) 1.29 (2010-08-16) various warning fixes from Aurelien Pocheville 1.28 (2010-08-01) fix bug in GIF palette transparency (SpartanJ) 1.27 (2010-08-01) cast-to-stbi_uc to fix warnings @@ -7944,7 +8353,6 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user first released version */ - /* ------------------------------------------------------------------------------ This software is available under 2 licenses -- choose whichever you prefer. 
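For orientation before the next file: the hunks above only reformat the stb_image query API, whose behaviour is unchanged — stbi_info_from_memory reports width, height and channel count without decoding pixels, and stbi_is_16_bit_from_memory reports whether the source stores 16 bits per channel. A minimal usage sketch follows; the probe_image helper and the byte-buffer argument are illustrative placeholders, not code from this patch.

#include <cstdio>
#include <vector>

#include "stb_image.h"  // assumed to be compiled with STB_IMAGE_IMPLEMENTATION in exactly one translation unit

// Probe an in-memory image file before deciding how (or whether) to decode it.
static void probe_image(const std::vector<unsigned char> & file_bytes) {
    int w = 0, h = 0, channels = 0;
    if (!stbi_info_from_memory(file_bytes.data(), (int) file_bytes.size(), &w, &h, &channels)) {
        std::printf("not a recognized image: %s\n", stbi_failure_reason());
        return;
    }
    const bool is16 = stbi_is_16_bit_from_memory(file_bytes.data(), (int) file_bytes.size()) != 0;
    std::printf("%d x %d, %d channel(s), %d bits per channel\n", w, h, channels, is16 ? 16 : 8);
}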
diff --git a/common/train.cpp b/common/train.cpp new file mode 100644 index 000000000..fef1e57c9 --- /dev/null +++ b/common/train.cpp @@ -0,0 +1,1513 @@ +#include "train.h" +#include "common.h" + +#include <random> +#include <sstream> +#include <functional> + +struct random_normal_distribution { + std::mt19937 gen; + std::normal_distribution<float> rd; + float min; + float max; +}; + +struct random_uniform_distribution { + std::mt19937 gen; + std::uniform_real_distribution<float> rd; +}; + +struct train_state * init_train_state() { + struct train_state * state = new struct train_state; + state->train_its = 0; + state->train_samples = 0; + state->train_tokens = 0; + state->train_epochs = 0; + state->shuffle_samples_hash = 0; + state->shuffle_sample_count = 0; + state->shuffle_next_sample = 0; + state->shuffle_rng_state_current = ""; + state->shuffle_rng_state_next = ""; + + state->opt = new struct ggml_opt_context; + state->opt->ctx = NULL; + state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); + state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; + state->opt->loss_after = 0.0f; + + return state; +} + +void free_train_state(struct train_state * state) { + delete state->opt; + delete state; +} + +struct random_normal_distribution * init_random_normal_distribution( + int seed, float mean, float std, float min, float max +) { + struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution)); + rnd->gen = std::mt19937(seed); + rnd->rd = std::normal_distribution<float>{mean, std}; + rnd->min = min; + rnd->max = max; + return rnd; +} + +struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) { + struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution)); + rnd->gen = std::mt19937(seed); + rnd->rd = std::uniform_real_distribution<float>{min, max}; + return rnd; +} + +void free_random_normal_distribution (struct random_normal_distribution * rnd) { + free(rnd); +} + +void free_random_uniform_distribution(struct random_uniform_distribution * rnd) { + free(rnd); +} + +struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { + float scale = 1.0f; // xavier + switch (ggml_n_dims(tensor)) { + case 1: + scale /= sqrtf((float) tensor->ne[0]); + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = scale * frand_normal(rnd); + } + break; + case 2: + scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = scale * frand_normal(rnd); + } + } + break; + case 3: + scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = scale * frand_normal(rnd); + } + } + } + break; + case 4: + scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] +
i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = scale * frand_normal(rnd); + } + } + } + } + break; + default: + die("Unsupported tensor->n_dims"); + }; + return tensor; +} + +struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { + switch (ggml_n_dims(tensor)) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = frand_uniform(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = frand_uniform(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = frand_uniform(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = frand_uniform(rnd); + } + } + } + } + break; + default: + die("Unsupported tensor->n_dims"); + }; + return tensor; +} + +float frand() { + return (float)rand()/((float)(RAND_MAX) + 1.0f); +} + +float frand_normal(struct random_normal_distribution * rnd) { + return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); +} + +float frand_uniform(struct random_uniform_distribution * rnd) { + return rnd->rd(rnd->gen); +} + +int clamp(const int v, const int min, const int max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float fclamp(const float v, const float min, const float max) { + return ((v < min) ? (min) : (v > max) ? 
(max) : v); +} + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == 1); + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); +} + +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); +} + +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == 1); +} + +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == ne3); +} + +int64_t get_example_targets_batch( + struct llama_context * lctx, + struct ggml_tensor * tokens_input, + struct ggml_tensor * target_probs, + int64_t example_id, + const size_t * samples_offs, + const size_t * samples_begin, + const size_t * samples_size, + size_t samples_count, + const llama_token * train_data, + size_t n_train_data, + bool separate_with_eos, + bool separate_with_bos, + bool fill_with_next_samples, + bool sample_random_offsets +) { + GGML_ASSERT(samples_count > 0); + GGML_ASSERT(ggml_is_matrix(tokens_input)); + GGML_ASSERT(ggml_is_3d(target_probs)); + int64_t n_vocab = target_probs->ne[0]; + int64_t n_tokens = tokens_input->ne[0]; + int64_t n_batch = tokens_input->ne[1]; + GGML_ASSERT(n_vocab == target_probs->ne[0]); + GGML_ASSERT(n_tokens == target_probs->ne[1]); + GGML_ASSERT(n_batch == target_probs->ne[2]); + + int64_t used_samples = 0; + + ggml_set_f32(target_probs, 0.0f); + llama_token bos = llama_token_bos(llama_get_model(lctx)); + llama_token eos = llama_token_eos(llama_get_model(lctx)); + // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); + for (int k=0; k= sample_size && fill_with_next_samples) { + if (!sample_separation_eos) { + // insert eos token to separate samples + sample_separation_eos = true; + } else if (!sample_separation_bos) { + // insert bos token to separate samples + sample_separation_bos = true; + token = bos; + } else { + // sample separation is done, continue with next sample + sample_separation_eos = !separate_with_eos; + sample_separation_bos = !separate_with_bos; + sample_offs = 0; + sample_idx = (example_id + used_samples) % samples_count; + sample_begin = samples_begin[sample_idx]; + sample_size = samples_size[sample_idx]; + ++used_samples; + } + } + // note: no else-if here + if (sample_offs < sample_size) { + token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1)); + ++sample_offs; + } + ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f); + if (i+1> rng; +} + +std::string mt19937_get_state(const std::mt19937& rng) { + std::stringstream s_rng_state; + s_rng_state.imbue(std::locale::classic()); + s_rng_state << rng; + return s_rng_state.str(); +} + +std::string mt19937_seed_to_state(unsigned seed) { + std::mt19937 rng(seed); + return mt19937_get_state(rng); +} + +std::string shuffle_samples( + const std::string & rng_state, + size_t * shuffled_offs, + size_t * shuffled_begins, + size_t * shuffled_sizes, + const size_t * begins, + const size_t * sizes, + size_t count) { + if (count == 0) return 
rng_state; + + std::mt19937 rng; + mt19937_set_state(rng, rng_state); + + // sort indices by random value for each index + std::vector idcs; + { + std::vector rnd; + idcs.resize(count); + rnd.resize(count); + for (unsigned i=0; i h_string; + std::hash h_ull; + size_t h = h_string(std::string(fn)); + h = hash_combine(h, h_ull((unsigned long long) sample_count)); + for (size_t i=0; i< sample_count; ++i) { + h = hash_combine(h, h_ull((unsigned long long) samples_begin[i])); + h = hash_combine(h, h_ull((unsigned long long) samples_size[i])); + } + return h; +} + +std::string replace_str(const char * s, const char * needle, const char * replacement) { + std::string str = s; + size_t pos = str.find(needle); + if (pos != std::string::npos) { + str.replace(pos, strlen(needle), replacement); + } + return str; +} + +void print_duration(double fmillis) { + if (fmillis < 1000.0f) { + printf("%.1fms", (float) fmillis); + return; + } + const int64_t one_sec = 1000; + const int64_t one_min = one_sec * 60; + const int64_t one_hour = one_min * 60; + const int64_t one_day = one_hour * 24; + + int64_t millis = (int64_t) fmillis; + int64_t days = millis/one_day; + int64_t hours = (millis - days*one_day)/one_hour; + int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min; + int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; + + // to print int64_t either cast to (long long int) or use macro PRId64 from + if (days > 0) { + printf("%lldd ", (long long int) days); + } + printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds); +} + +float cosine_decay(int64_t step, int64_t decay_steps, float minimum) { + if (step > decay_steps) { + step = decay_steps; + } + const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); + const float decay = (1 - minimum)*cosine_decay + minimum; + return decay; +} + +float cosine_decay_restart(int64_t step, int64_t decay_steps, float minimum, float restart_step_mult) { + while (step > decay_steps) { + step -= decay_steps; + decay_steps = (int64_t) (restart_step_mult * decay_steps); + } + return cosine_decay(step, decay_steps, minimum); +} + +float learning_schedule( + int64_t step, + int64_t warmup_steps, + int64_t cos_decay_steps, + float learning_rate, + float overall_minimum, + float cos_decay_minimum, + float cos_decay_restart_step_mult, + bool enable_restart) { + + float result = + (step < warmup_steps) + ? (float) step / (float) warmup_steps + : enable_restart + ? 
cosine_decay_restart( + step - warmup_steps, + cos_decay_steps, + cos_decay_minimum, + cos_decay_restart_step_mult) + : cosine_decay( + step, + cos_decay_steps, + cos_decay_minimum); + + float min = overall_minimum / learning_rate; + result = min + result * (1.0f - min); + return result; +} + +static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(a != NULL); + GGML_ASSERT(b != NULL); + GGML_ASSERT(a->type == b->type); + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); + + return true; +} + +void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { + if (dst == NULL) { + return; + } + struct ggml_tensor * t = ggml_get_tensor(ctx, name); + GGML_ASSERT(are_same_layout(dst, t)); + memcpy(dst->data, t->data, ggml_nbytes(t)); + + if (strlen(ggml_get_name(dst)) == 0) { + ggml_set_name(dst, name); + } +} + +// gguf constants +static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; +static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; +static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; +static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; +static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; +static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; +static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; +static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; + +static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; + +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = 
"optimizer.lbfgs.memory_alpha"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; + +static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; +static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; +static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; +static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; +static const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count"; +static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash"; +static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state"; +static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count"; +static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE = "training.shuffle.next_sample"; + +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + die_fmt("key not found in model: %s", skey.c_str()); \ + } \ +} + +void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); + GGML_ASSERT(file_version == 0); + + GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); + GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); + GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); + + uint64_t nx; + GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); + opt->nx = (size_t) nx; + + // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know + + std::string opt_type; + GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); + if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { + opt->params.type = GGML_OPT_TYPE_ADAM; + + GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); + GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); + + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + copy_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { + opt->params.type = GGML_OPT_TYPE_LBFGS; + + GGUF_GET_KEY(fctx, 
opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); + GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); + GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); + GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); + GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); + GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); + + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + copy_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + copy_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + copy_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + copy_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + copy_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + copy_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + copy_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + copy_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + copy_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + copy_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + } else { + die("unknown optimizer type\n"); + } +} + +void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); + gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); + gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); + + switch (opt->params.type) { + case GGML_OPT_TYPE_ADAM: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); + + ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + if (opt->adam.pf) { + ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } + + gguf_add_tensor(fctx, opt->adam.m); + gguf_add_tensor(fctx, opt->adam.v); + if (opt->adam.pf) { + gguf_add_tensor(fctx, opt->adam.pf); + } + } break; + case GGML_OPT_TYPE_LBFGS: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); + gguf_set_val_i32(fctx, 
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); + + ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + if (opt->lbfgs.pf) { + ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + } + ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + + gguf_add_tensor(fctx, opt->lbfgs.x); + gguf_add_tensor(fctx, opt->lbfgs.xp); + gguf_add_tensor(fctx, opt->lbfgs.g); + gguf_add_tensor(fctx, opt->lbfgs.gp); + gguf_add_tensor(fctx, opt->lbfgs.d); + if (opt->lbfgs.pf) { + gguf_add_tensor(fctx, opt->lbfgs.pf); + } + gguf_add_tensor(fctx, opt->lbfgs.lmal); + gguf_add_tensor(fctx, opt->lbfgs.lmys); + gguf_add_tensor(fctx, opt->lbfgs.lms); + gguf_add_tensor(fctx, opt->lbfgs.lmy); + } break; + } +} + +bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train) { + if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) < 0) { + return false; + } + + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); + GGML_ASSERT(file_version <= 1); + + if (file_version == 0) { + + GGUF_GET_KEY(fctx, train->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, train->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); + + } else if (file_version == 1) { + + GGUF_GET_KEY(fctx, train->train_its, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, train->train_tokens, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT); + GGUF_GET_KEY(fctx, train->train_epochs, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT); + + GGUF_GET_KEY(fctx, train->shuffle_samples_hash, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH); + GGUF_GET_KEY(fctx, train->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE); + GGUF_GET_KEY(fctx, train->shuffle_sample_count, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, train->shuffle_next_sample, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE); + } + + load_opt_context_gguf(fctx, f_ggml_ctx, train->opt); + return true; +} + +void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train) { + gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1); + 
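+ // note: file_version 1 (written here) stores the iteration/sample/token/epoch counters as 64-bit values and adds the shuffling state; + // the loader above still accepts file_version 0, which used 32-bit counters and had no shuffle keys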
gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, train->train_its); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, train->train_samples); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT, train->train_tokens); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT, train->train_epochs); + + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) train->shuffle_samples_hash); + gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE, train->shuffle_rng_state_current.c_str()); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) train->shuffle_sample_count); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE, (uint64_t) train->shuffle_next_sample); + + save_opt_context_gguf(fctx, train->opt); +} + + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != 1) { + die("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + die_fmt("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +static size_t utf8_len(char src) { + const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +// mark each byte with its utf8 unit number. +// returns the number of utf8 characters. +// e.g. when bytes == '\x61\xD0\xB0\x62', +// then utf8_units will become [0,0,1,0] +// utf8_nunits will become [1,2,2,1] and 3 is returned. +// bytes where utf8_units is zero, are the begin of an utf8 character. +static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) { + size_t offs = 0; + size_t count_utf8 = 0; + while(offs < count) { + int len = (int) utf8_len(bytes[offs]); + for (int i=0; i & out_tokens, + std::vector & out_samples_begin, + std::vector & out_samples_size) { + struct llama_file f(filename, "rb"); + + if (f.size == 0) { + out_tokens.clear(); + out_samples_begin.clear(); + out_samples_size.clear(); + printf("%s: warning: empty or not existing training data file '%s'\n", + __func__, filename); + return out_tokens.size(); + } + + // account for possible leading whitespace that will be added by tokenizer + // e.g. 
'\t' will be tokenized by llama spm tokenizer to [29871, 12] + const int n_max_tokens_overhead = 1; + + std::vector buf; + buf.resize(f.size); + + f.read_raw(buf.data(), f.size); + + std::vector utf8_units; + std::vector utf8_nunits; + utf8_units.resize(buf.size()); + utf8_nunits.resize(buf.size()); + mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size()); + + if (sample_start.size() == 0) { + // tokenize all data at once + out_tokens.resize(buf.size() + n_max_tokens_overhead); + + int n_tokens = llama_tokenize( + llama_get_model(lctx), + buf.data(), + (int) buf.size(), + out_tokens.data(), + (int) out_tokens.size(), + false, false); + if (n_tokens < 0) { + out_tokens.resize(-n_tokens); + n_tokens = llama_tokenize( + llama_get_model(lctx), + buf.data(), + (int) buf.size(), + out_tokens.data(), + (int) out_tokens.size(), + false, false); + } + if (n_tokens >= 0) { + out_tokens.resize(n_tokens); + } + + // generate sample starts at all token positions + out_samples_begin.clear(); + out_samples_begin.push_back(0); + out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size())); + size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0; + for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) { + out_samples_begin.push_back(sample_begin); + out_samples_size.push_back(context_length); + } + } else { + // split data into samples and tokenize each sample + std::string data_str(buf.data(), buf.size()); + out_samples_begin.clear(); + out_samples_size.clear(); + out_tokens.clear(); + + // find all positions of pattern sample_start + size_t sample_begin = data_str.find(sample_start, 0); + while (sample_begin != std::string::npos) { + out_samples_begin.push_back(sample_begin); + const size_t search_start = sample_begin + sample_start.size(); + sample_begin = data_str.find(sample_start, search_start); + } + if (out_samples_begin.size() == 0) { + printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n", + __func__, sample_start.c_str()); + out_samples_begin.push_back(0); + } + + out_samples_size.resize(out_samples_begin.size(), 0); + + std::vector buf_sample; + std::vector tok_sample; + + const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size()); + size_t found_too_big_sample = 0; + size_t found_too_small_sample = 0; + size_t found_empty_sample = 0; + size_t found_min_sample_size = SIZE_MAX; + size_t found_max_sample_size = 0; + + size_t max_token_text_size = 0; + int n_vocab = llama_n_vocab(llama_get_model(lctx)); + for (llama_token token=0; token < n_vocab; ++token) { + max_token_text_size = std::max( + max_token_text_size, + strlen(llama_token_get_text(llama_get_model(lctx), token))); + } + + // upper bound of context byte length. + // strings with this byte length should always tokenize to at least context_length tokens. + size_t context_byte_len = max_token_text_size*context_length; + + for (unsigned i=0; i 0) { + // sample end is in the middle of an utf8 character. + // advance sample_end to the begin of the next utf8 character. + sample_end += utf8_nunits[sample_end] - utf8_units[sample_end]; + } + size_t sample_size = sample_end - sample_begin; + if (sample_size == 0) { + ++found_empty_sample; + } + + if (sample_size > 0) { + // llama_tokenize expects zero terminated string, + // copy sample into buffer and zero terminate it. 
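+ // note: the sample length is also passed to llama_tokenize explicitly below, as (int) buf_sample.size()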
+ buf_sample.resize(sample_size); + memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); + + // printf("sample: '%s'\n", buf_sample.data()); + + // tokenize the sample + tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); + int n_tokens = llama_tokenize(llama_get_model(lctx), + buf_sample.data(), + (int) buf_sample.size(), + tok_sample.data(), + (int) tok_sample.size(), + false, false); + if (n_tokens < 0) { + tok_sample.resize(-n_tokens); + n_tokens = llama_tokenize(llama_get_model(lctx), + buf_sample.data(), + (int) buf_sample.size(), + tok_sample.data(), + (int) tok_sample.size(), + false, false); + GGML_ASSERT(n_tokens >= 0); + } + GGML_ASSERT(n_tokens <= (int) tok_sample.size()); + + if ((size_t) n_tokens > context_length) { + ++found_too_big_sample; + } else if ((size_t) n_tokens < context_length) { + ++found_too_small_sample; + } + found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens); + found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens); + + // write out tokens, start and size of sample + // overwrite the string start position with the token start position + out_samples_begin[i] = out_tokens.size(); + out_samples_size[i] = (size_t) n_tokens; + out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens); + } else { + out_samples_begin[i] = out_tokens.size(); + out_samples_size[i] = 0; + } + + } + if (found_too_big_sample > 0) { + printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. samples will be cut off.\n", + __func__, found_too_big_sample, found_max_sample_size, context_length); + } + + if (found_too_small_sample > 0) { + printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n", + __func__, found_too_small_sample, found_min_sample_size, context_length); + } + + if (found_empty_sample) { + printf("%s: warning: found %zu empty samples.\n", + __func__, found_empty_sample); + } + } + printf("%s: total number of samples: %zu\n", + __func__, out_samples_begin.size()); + + GGML_ASSERT(out_samples_begin.size() == out_samples_size.size()); + + return out_tokens.size(); +} + +std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration) { + std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest); + return replace_str(filename, pattern_it, sit.c_str()); +} + +struct train_params_common get_default_train_params_common() { + struct train_params_common params; + params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.gguf"; + params.fn_checkpoint_out = "checkpoint-ITERATION.gguf"; + params.pattern_fn_it = "ITERATION"; + params.fn_latest = "LATEST"; + + params.print_usage = false; + + params.save_every = 10; + + params.seed = -1; + + params.n_ctx = 128; + params.n_threads = 6; + params.n_batch = 8; + params.n_gradient_accumulation = 1; + params.n_epochs = -1; + params.n_gpu_layers = 0; + + params.custom_n_ctx = false; + + params.use_flash = false; + params.use_checkpointing = true; + + params.sample_start = ""; + params.include_sample_start = false; + params.escape = false; + params.overlapping_samples = false; + params.fill_with_next_samples = false; + params.separate_with_eos = false; + params.separate_with_bos = true; + params.sample_random_offsets = false; + params.force_reshuffle = false; + + params.opt_past = 0; + params.opt_delta = 1e-5f; + params.opt_max_no_improvement = 0; + + params.warmup = 100; + params.cos_decay_steps = 1000; + params.cos_decay_restart = 1.1f; + params.cos_decay_min = 0.1f; + params.enable_restart = false; + + params.adam_n_iter = 256; + params.adam_alpha = 1e-3f; + params.adam_min_alpha = 0; + params.adam_decay = 1e-1f; + params.adam_decay_min_ndim = 2; + params.adam_beta1 = 0.9f; + params.adam_beta2 = 0.999f; + params.adam_gclip = 1.0f; + params.adam_eps_f = 0.0f; + + return params; +} + +void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train_params_common * params) { + // fprintf(stderr, "usage: %s [options]\n", argv[0]); + // fprintf(stderr, "\n"); + // fprintf(stderr, "options:\n"); + // fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); + fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); + fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); + fprintf(stderr, " --pattern-fn-it STR pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it); + fprintf(stderr, " --fn-latest STR string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest); + fprintf(stderr, " --save-every N save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%d')\n", params->save_every); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); + fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); + fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); + fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); + fprintf(stderr, " --grad-acc N Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation); + fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. 
(default '%s')\n", params->sample_start.c_str()); + fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n"); + fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); + fprintf(stderr, " --overlapping-samples Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n"); + fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n"); + fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : ""); + fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : ""); + fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : ""); + fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : ""); + fprintf(stderr, " --sample-random-offsets Use samples beginning at random offsets. Together with fill-with-next-samples this may help for training endless text generation.%s\n", params->sample_random_offsets ? " (default)" : ""); + fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n"); + fprintf(stderr, " --no-flash Don't use flash attention \n"); + fprintf(stderr, " --use-flash Use flash attention (default)\n"); + fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); + fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); + fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min); + fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); + fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta); + fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); + fprintf(stderr, " --epochs N Maximum number epochs to process. 
(default %d)\n", params->n_epochs); + fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); + fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); + fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); + fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); + fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); + fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); + fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); + fprintf(stderr, " -ngl N, --n-gpu-layers N Number of model layers to offload to GPU (default %d)", params->n_gpu_layers); + fprintf(stderr, "\n"); +} + +bool consume_common_train_arg( + int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param +) { + int& i = *idx; + std::string arg = argv[i]; + const std::string arg_prefix = "--"; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (arg == "--train-data") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->fn_train_data = argv[i]; + } else if (arg == "--checkpoint-in") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->fn_checkpoint_in = argv[i]; + } else if (arg == "--checkpoint-out") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->fn_checkpoint_out = argv[i]; + } else if (arg == "--pattern-fn-it") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->pattern_fn_it = argv[i]; + } else if (arg == "--fn-latest") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->fn_latest = argv[i]; + } else if (arg == "--save-every") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->save_every = std::stoi(argv[i]); + } else if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->seed = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_ctx = std::stoi(argv[i]); + params->custom_n_ctx = true; + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_batch = std::stoi(argv[i]); + } else if (arg == "--grad-acc") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_gradient_accumulation = std::max(1, std::stoi(argv[i])); + } else if (arg == "--sample-start") { + if (++i >= argc) { + *invalid_param = true; + 
return true; + } + params->sample_start = std::string(argv[i]); + } else if (arg == "--escape") { + params->escape = true; + } else if (arg == "--include-sample-start") { + params->include_sample_start = true; + } else if (arg == "--overlapping-samples") { + params->overlapping_samples = true; + } else if (arg == "--fill-with-next-samples") { + params->fill_with_next_samples = true; + } else if (arg == "--separate-with-eos") { + params->separate_with_eos = true; + } else if (arg == "--separate-with-bos") { + params->separate_with_bos = true; + } else if (arg == "--no-separate-with-eos") { + params->separate_with_eos = false; + } else if (arg == "--no-separate-with-bos") { + params->separate_with_bos = false; + } else if (arg == "--sample-random-offsets") { + params->sample_random_offsets = true; + } else if (arg == "--force-reshuffle") { + params->force_reshuffle = true; + } else if (arg == "--no-flash") { + params->use_flash = false; + } else if (arg == "--use-flash") { + params->use_flash = true; + } else if (arg == "--no-checkpointing") { + params->use_checkpointing = false; + } else if (arg == "--use-checkpointing") { + params->use_checkpointing = true; + } else if (arg == "--warmup") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->warmup = std::stoi(argv[i]); + } else if (arg == "--cos-decay-steps") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->cos_decay_steps = std::stoi(argv[i]); + } else if (arg == "--cos-decay-restart") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->cos_decay_restart = std::stof(argv[i]); + } else if (arg == "--cos-decay-min") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->cos_decay_min = std::stof(argv[i]); + } else if (arg == "--enable-restart") { + params->enable_restart = true; + } else if (arg == "--disable-restart") { + params->enable_restart = false; + } else if (arg == "--opt-past") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->opt_past = std::stoi(argv[i]); + } else if (arg == "--opt-delta") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->opt_delta = std::stof(argv[i]); + } else if (arg == "--opt-max-no-improvement") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->opt_max_no_improvement = std::stoi(argv[i]); + } else if (arg == "--adam-epsf") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_eps_f = std::stof(argv[i]); + } else if (arg == "--epochs") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_epochs = std::stoi(argv[i]); + } else if (arg == "--adam-iter") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-alpha") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-min-alpha") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_min_alpha = std::stof(argv[i]); + } else if (arg == "--adam-decay") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_decay = std::stof(argv[i]); + } else if (arg == "--adam-decay-min-ndim") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_decay_min_ndim = std::stoi(argv[i]); + } else if (arg == "--adam-beta1") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_beta1 
= std::stof(argv[i]); + } else if (arg == "--adam-beta2") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_beta2 = std::stof(argv[i]); + } else if (arg == "--adam-gclip") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_gclip = std::stof(argv[i]); + } else if (arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + if (llama_supports_gpu_offload()) { + params->n_gpu_layers = std::stoi(argv[i]); + } else { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } else if (arg == "-h" || arg == "--help") { + params->print_usage = true; + return true; + } else { + return false; + } + return true; +} + +void finish_processing_train_args(struct train_params_common * params) { + if (params->escape) { + string_process_escapes(params->sample_start); + } +} + +void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) { + struct train_opt_callback_data * data = (struct train_opt_callback_data *) vdata; + struct train_params_common * params = data->params; + struct train_state * train = data->train; + struct ggml_opt_context * opt = train->opt; + int n_batch = params->n_batch; + int n_ctx = params->n_ctx; + + if (accum_step == 0) { + // time measurement + int64_t now = ggml_time_ms(); + if (now > data->last_time && opt->iter > data->first_iter) { + double dt = (double) (now - data->last_time); + if (data->millis_per_iter == 0.0) { + data->millis_per_iter = dt; + } else { + const double gain = 0.7; + data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain; + } + } + + double remaining_millis = 0.0; + if (data->millis_per_iter > 0.0) { + const int n_iter = params->adam_n_iter; + const int done_iter = opt->iter - data->first_iter; + const int remaining_iter = n_iter - done_iter; + remaining_millis = remaining_iter * data->millis_per_iter; + } + + // file saving + const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); + if (save_now) { + int new_iters = opt->iter - data->last_save_iter; + train->train_its += new_iters; + train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; + + if (data->save_cb) { + data->save_cb(data->save_data, train); + } + + data->last_save_iter = opt->iter; + } + + // exclude file saving from time measurement, by measuring last_time after saving + data->last_time = ggml_time_ms(); + + *sched = learning_schedule( + opt->iter, + params->warmup, + params->cos_decay_steps, + params->adam_alpha, + params->adam_min_alpha, + params->cos_decay_min, + params->cos_decay_restart, + params->enable_restart); + + int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + if (impr_plot > 0) impr_plot = 0; + if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0; + printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f", + __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count, + *sched, opt->loss_after); + + + if (data->millis_per_iter > 0) { + printf(" dt="); + print_duration(data->millis_per_iter); + printf(" eta="); + print_duration(remaining_millis); + } + + float improvement = opt->loss_before - opt->loss_after; + const float plot_scale = 10.0f; + int bar_len = (int)(1 + 
improvement*plot_scale + 0.5); + printf(" |"); + for (int i=0; i<bar_len; ++i) { + printf("-"); + } + printf(">"); + printf("\n"); + } + + int64_t used_samples = get_example_targets_batch( + data->lctx, + data->tokens_input, + data->target_probs, + train->shuffle_next_sample, + data->shuffled_samples_offs, + data->shuffled_samples_begin, + data->shuffled_samples_size, + data->samples_count, + data->tokens_data, + data->tokens_size, + params->separate_with_eos, + params->separate_with_bos, + params->fill_with_next_samples, + params->sample_random_offsets); + + train->train_samples += used_samples; + train->shuffle_next_sample += used_samples; + + if (train->shuffle_next_sample >= train->shuffle_sample_count) { + ++train->train_epochs; + printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs); + // note: we may have used some samples from the current shuffling more than once + train->shuffle_rng_state_current = train->shuffle_rng_state_next; + train->shuffle_rng_state_next = shuffle_samples( + train->shuffle_rng_state_current, + data->shuffled_samples_offs, + data->shuffled_samples_begin, + data->shuffled_samples_size, + data->samples_begin, + data->samples_size, + data->samples_count); + train->shuffle_next_sample = 0; + } + + const bool last_epoch_reached = (params->n_epochs > 0 && (int64_t) train->train_epochs - data->first_epoch >= params->n_epochs); + if (last_epoch_reached) { + // allow optimization iteration at last epoch to be completed before canceling + if (data->iter_at_last_epoch < 0) { + data->iter_at_last_epoch = opt->iter; + } else if (opt->iter > data->iter_at_last_epoch) { + *cancel = true; + } + } +} diff --git a/common/train.h b/common/train.h new file mode 100644 index 000000000..263d940c0 --- /dev/null +++ b/common/train.h @@ -0,0 +1,233 @@ +// Various helper functions and utilities for training + +#pragma once + +#include <string> +#include <random> +#include <vector> + +#include "ggml.h" +#include "llama.h" + +#define LLAMA_TRAIN_MAX_NODES 16384 + +typedef std::string mt19937_state; + +struct train_state { + struct ggml_opt_context * opt; + + uint64_t train_its; + uint64_t train_samples; + uint64_t train_tokens; + uint64_t train_epochs; + + size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes) + mt19937_state shuffle_rng_state_current; + mt19937_state shuffle_rng_state_next; + size_t shuffle_sample_count; + size_t shuffle_next_sample; +}; + +struct train_params_common { + const char * fn_train_data; + const char * fn_checkpoint_in; + const char * fn_checkpoint_out; + const char * pattern_fn_it; + const char * fn_latest; + + bool print_usage; + + int save_every; + + uint32_t seed; + + int n_ctx; + int n_threads; + int n_batch; + int n_gradient_accumulation; + int n_epochs; + int n_gpu_layers; + + bool custom_n_ctx; + + bool use_flash; + bool use_checkpointing; + + std::string sample_start; + bool include_sample_start; + bool escape; + bool overlapping_samples; + bool fill_with_next_samples; + bool separate_with_eos; + bool separate_with_bos; + bool sample_random_offsets; + + bool force_reshuffle; + + int warmup; + int cos_decay_steps; + float cos_decay_restart; + float cos_decay_min; + bool enable_restart; + + int opt_past; + float opt_delta; + int opt_max_no_improvement; + + int adam_n_iter; + float adam_alpha; + float adam_min_alpha; + float adam_decay; + int adam_decay_min_ndim; + float adam_beta1; + float adam_beta2; + float adam_gclip; + float adam_eps_f; +}; + +typedef void (*save_train_files_callback)(void * data, struct train_state * train); + +struct
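An aside on the timing logic in train_opt_callback above: the callback smooths the per-iteration wall time with an exponential moving average (gain 0.7) and derives the printed "eta" from the iterations still to run. A minimal Python sketch of that update, with names of my own choosing:

def update_timing(millis_per_iter: float, dt_ms: float, done_iter: int, total_iter: int,
                  gain: float = 0.7) -> tuple[float, float]:
    # exponential moving average of per-iteration wall time, as in train_opt_callback
    if millis_per_iter == 0.0:
        millis_per_iter = dt_ms  # first measurement is taken as-is
    else:
        millis_per_iter = millis_per_iter * (1.0 - gain) + dt_ms * gain
    # remaining time = smoothed time per iteration * iterations left (here: adam_n_iter - done_iter)
    remaining_millis = (total_iter - done_iter) * millis_per_iter
    return millis_per_iter, remaining_millis

# e.g. update_timing(0.0, 850.0, 1, 256) == (850.0, 216750.0)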
train_opt_callback_data { + struct train_params_common * params; + struct train_state * train; + save_train_files_callback save_cb; + void * save_data; + struct llama_context * lctx; + int last_save_iter; + llama_token * tokens_data; + size_t tokens_size; + size_t * samples_begin; + size_t * samples_size; + size_t * shuffled_samples_offs; + size_t * shuffled_samples_begin; + size_t * shuffled_samples_size; + size_t samples_count; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_probs; + int first_iter; + int first_epoch; + int iter_at_last_epoch; + int64_t last_time; + double millis_per_iter; +}; + +struct train_state * init_train_state(); +void free_train_state(struct train_state * state); + +struct train_params_common get_default_train_params_common(); +void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params); + +bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param); +void finish_processing_train_args(struct train_params_common * params); + +struct random_normal_distribution; +struct random_uniform_distribution; + +struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max); +struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max); + +void free_random_normal_distribution (struct random_normal_distribution * rnd); +void free_random_uniform_distribution(struct random_uniform_distribution * rnd); + +struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd); +struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd); + +// generate random float in interval [0,1) +float frand(); +float frand_normal (struct random_normal_distribution * rnd); +float frand_uniform(struct random_uniform_distribution * rnd); + +int clamp (const int v, const int min, const int max); +float fclamp(const float v, const float min, const float max); + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0); +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1); +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2); +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3); + +size_t tokenize_file( + struct llama_context * lctx, + const char * filename, + const std::string & sample_start, + bool include_sample_start, + bool overlapping_samples, + unsigned context_length, + std::vector<llama_token> & out_tokens, + std::vector<size_t> & out_samples_begin, + std::vector<size_t> & out_samples_size); + +int64_t get_example_targets_batch( + struct llama_context * lctx, + struct ggml_tensor * tokens_input, + struct ggml_tensor * target_probs, + int64_t example_id, + const size_t * samples_offs, + const size_t * samples_begin, + const size_t * samples_size, + size_t samples_count, + const llama_token * train_data, + size_t n_train_data, + bool separate_with_eos, + bool separate_with_bos, + bool fill_with_next_samples, + bool sample_random_offsets); + + +void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state); +mt19937_state mt19937_get_state(const std::mt19937& rng); +mt19937_state mt19937_seed_to_state(unsigned seed); + +mt19937_state shuffle_samples( + const mt19937_state & rng_state, + size_t * shuffled_offs, + size_t * shuffled_begins, + size_t * shuffled_sizes, + const
size_t * begins, + const size_t * sizes, + size_t count); + +size_t hash_combine(size_t h1, size_t h2); + +size_t compute_samples_hash( + const char* fn, + const size_t* samples_begin, + const size_t* samples_size, + size_t sample_count); + + +std::string replace_str(const char * s, const char * needle, const char * replacement); + +void print_duration(double milliseconds); + +float cosine_decay( + int64_t step, + int64_t decay_steps, + float minimum); + +float cosine_decay_restart( + int64_t step, + int64_t decay_steps, + float minimum, + float restart_step_mult); + +float learning_schedule( + int64_t step, + int64_t warmup_steps, + int64_t decay_steps, + float learning_rate, + float overall_minimum, + float cos_decay_minimum, + float cos_decay_restart_step_mult, + bool enable_restart); + +void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name); + +void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt); +void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt); + +bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train); +void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train); + +std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration); + +void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 018a2a588..dde4fa9c8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3,7 +3,6 @@ from __future__ import annotations -import ast import logging import argparse import contextlib @@ -15,7 +14,6 @@ from enum import IntEnum from pathlib import Path from hashlib import sha256 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast -from itertools import chain import math import numpy as np @@ -72,8 +70,7 @@ class Model: def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, - split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, - small_first_shard: bool = False, hparams: dict[str, Any] | None = None): + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") @@ -88,7 +85,7 @@ class Model: self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams + self.hparams = Model.load_hparams(self.dir_model) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None @@ -132,14 +129,12 @@ class Model: def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" - index_name += ".index.json" - index_file = self.dir_model / index_name - - if index_file.is_file(): + if 
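The cosine_decay, cosine_decay_restart and learning_schedule prototypes above describe a linear warmup followed by cosine decay, optionally with progressively longer restart cycles. The Python below sketches that shape for orientation only; the actual arithmetic lives in common/train.cpp, which is not part of this hunk, so the details here are assumptions rather than the implementation:

import math

def cosine_decay(step: int, decay_steps: int, minimum: float) -> float:
    # cosine-interpolate from 1.0 down to `minimum` over `decay_steps`, then hold
    decay_steps = max(decay_steps, 1)
    step = min(step, decay_steps)
    decay = 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
    return (1.0 - minimum) * decay + minimum

def cosine_decay_restart(step: int, decay_steps: int, minimum: float, restart_step_mult: float) -> float:
    # when one decay window is exhausted, restart with a window scaled by restart_step_mult
    while step > decay_steps:
        step -= decay_steps
        decay_steps = max(1, int(restart_step_mult * decay_steps))
    return cosine_decay(step, decay_steps, minimum)

def learning_schedule(step, warmup_steps, cos_decay_steps, learning_rate,
                      overall_minimum, cos_decay_minimum,
                      cos_decay_restart_step_mult, enable_restart) -> float:
    # linear warmup, then cosine decay; the result is the multiplier applied to adam_alpha,
    # floored so the effective rate never drops below overall_minimum (adam_min_alpha)
    if step < warmup_steps:
        result = step / max(warmup_steps, 1)
    elif enable_restart:
        result = cosine_decay_restart(step - warmup_steps, cos_decay_steps,
                                      cos_decay_minimum, cos_decay_restart_step_mult)
    else:
        result = cosine_decay(step - warmup_steps, cos_decay_steps, cos_decay_minimum)
    lo = overall_minimum / learning_rate
    return lo + result * (1.0 - lo)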
len(self.part_names) > 1: self.tensor_names = set() + index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name += ".index.json" logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(index_file, "r", encoding="utf-8") as f: + with open(self.dir_model / index_name, "r", encoding="utf-8") as f: index: dict[str, Any] = json.load(f) weight_map = index.get("weight_map") if weight_map is None or not isinstance(weight_map, dict): @@ -147,7 +142,6 @@ class Model: self.tensor_names.update(weight_map.keys()) else: self.tensor_names = tensor_names_from_parts - weight_map = {} for part_name in self.part_names: logger.info(f"gguf: loading model part '{part_name}'") @@ -174,17 +168,9 @@ class Model: data = LazyTorchTensor.from_eager(data) yield name, data - # verify tensor name presence and identify potentially missing files - if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: - missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) - extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) - missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) - if len(extra) == 0 and len(missing_files) > 0: - raise ValueError(f"Missing or incomplete model files: {missing_files}") - else: - raise ValueError("Mismatch between weight map and model parts for tensor names:\n" - f"Missing tensors: {missing}\n" - f"Extra tensors: {extra}") + # only verify tensor name presence; it doesn't matter if they are not in the right files + if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: + raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: if key not in gguf.MODEL_TENSORS[self.model_arch]: @@ -221,17 +207,17 @@ class Model: self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") - if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) logger.info(f"gguf: feed forward length = {n_ff}") - if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) @@ -265,19 +251,20 @@ class Model: return [(self.map_tensor_name(name), data_torch)] - def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: del name, new_name, bid, n_dims # unused return False - # some models need extra generated tensors (like rope_freqs) - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - return () + def extra_f16_tensors(self, 
name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid, n_dims # unused + + return False def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") - for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()): + for name, data_torch in self.get_tensors(): # we don't need these if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): continue @@ -295,83 +282,57 @@ class Model: bid = int(part) break - for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): - # TODO: why do we squeeze here? - # data = data_torch.squeeze().numpy() - data = data_torch.numpy() - - # if data ends up empty, it means data_torch was a scalar tensor -> restore - if len(data.shape) == 0: - data = data_torch.numpy() - + for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): + data: np.ndarray # type hint n_dims = len(data.shape) - data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) + data_dtype = data.dtype + data_qtype: gguf.GGMLQuantizationType | None = None + + # when both are True, f32 should win + extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims) + extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims) # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors - if n_dims <= 1 or new_name.endswith("_norm.weight"): - data_qtype = gguf.GGMLQuantizationType.F32 - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + extra_f32 = any(cond for cond in ( + extra_f32, + n_dims == 1, + new_name.endswith("_norm.weight"), + )) + # Some tensor types are always in float32 - if data_qtype is False and ( - any( - self.match_model_tensor_name(new_name, key, bid) - for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - gguf.MODEL_TENSOR.SSM_CONV1D, - gguf.MODEL_TENSOR.TIME_MIX_FIRST, - gguf.MODEL_TENSOR.TIME_MIX_W1, - gguf.MODEL_TENSOR.TIME_MIX_W2, - gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, - gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, - gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED, - gguf.MODEL_TENSOR.POSNET_NORM1, - gguf.MODEL_TENSOR.POSNET_NORM2, - ) - ) - or not new_name.endswith(".weight") - ): - data_qtype = gguf.GGMLQuantizationType.F32 + extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + )) - if data_qtype is False and any( - self.match_model_tensor_name(new_name, key, bid) - for key in ( - gguf.MODEL_TENSOR.TOKEN_EMBD, - gguf.MODEL_TENSOR.OUTPUT, - ) - ): - if self.ftype in ( - gguf.LlamaFileType.MOSTLY_TQ1_0, - gguf.LlamaFileType.MOSTLY_TQ2_0, - ): - # TODO: use Q4_K and Q6_K - data_qtype = gguf.GGMLQuantizationType.F16 + # if f16 desired, convert any float32 2-dim weight tensors to float16 + extra_f16 = any(cond for cond in ( + extra_f16, + (name.endswith(".weight") and n_dims >= 2), + )) - # No override (data_qtype is False), or wants to be quantized (data_qtype is True) - if isinstance(data_qtype, bool): - if self.ftype == gguf.LlamaFileType.ALL_F32: - data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: - data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and 
not extra_f32: + if self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data = gguf.quantize_bf16(data) + assert data.dtype == np.int16 data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: - data_qtype = gguf.GGMLQuantizationType.Q8_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: - data_qtype = gguf.GGMLQuantizationType.TQ1_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: - data_qtype = gguf.GGMLQuantizationType.TQ2_0 - else: - raise ValueError(f"Unknown file type: {self.ftype.name}") - try: - data = gguf.quants.quantize(data, data_qtype) - except gguf.QuantError as e: - logger.warning("%s, %s", e, "falling back to F16") - data_qtype = gguf.GGMLQuantizationType.F16 - data = gguf.quants.quantize(data, data_qtype) + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): + data = gguf.quantize_q8_0(data) + assert data.dtype == np.uint8 + data_qtype = gguf.GGMLQuantizationType.Q8_0 + + else: # default to float16 for quantized tensors + if data_dtype != np.float16: + data = data.astype(np.float16) + data_qtype = gguf.GGMLQuantizationType.F16 + + if data_qtype is None: # by default, convert to float32 + if data_dtype != np.float32: + data = data.astype(np.float32) + data_qtype = gguf.GGMLQuantizationType.F32 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape @@ -478,11 +439,6 @@ class Model: return modelcls return func - @classmethod - def print_registered_models(cls): - for name in sorted(cls._model_classes.keys()): - logger.error(f"- {name}") - @classmethod def from_model_architecture(cls, arch: str) -> type[Model]: try: @@ -535,19 +491,9 @@ class Model: else: token: str = reverse_vocab[i] if token in added_vocab: - # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. - # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not tokenizer.added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) - if previous_token != token: - logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: - # NOTE: this was added for Gemma. - # Encoding and decoding the tokens above isn't sufficient for this case. 
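Stepping back from the vocabulary changes for a moment: the prepare_tensors hunk above replaces the data_qtype state machine with two boolean hints, extra_f32 and extra_f16, combined with the requested file type. Below is a compact, standalone restatement of that decision; the helper name and the string labels for the file types are mine, and the Q8_0 shape check of the real code is only noted in a comment:

def choose_output_dtype(name: str, new_name: str, n_dims: int, ftype: str,
                        extra_f32: bool, extra_f16: bool) -> str:
    # ftype is one of "ALL_F32", "MOSTLY_F16", "MOSTLY_BF16", "MOSTLY_Q8_0"
    # 1-D tensors and *_norm.weight always stay F32; when both hints fire, F32 wins
    # (the real code additionally forces F32 for a few tensor kinds such as FFN_GATE_INP,
    #  POS_EMBD and TOKEN_TYPES, matched by model tensor key rather than by name)
    force_f32 = extra_f32 or n_dims == 1 or new_name.endswith("_norm.weight")
    want_f16  = extra_f16 or (name.endswith(".weight") and n_dims >= 2)

    if ftype != "ALL_F32" and want_f16 and not force_f32:
        if ftype == "MOSTLY_BF16":
            return "BF16"
        if ftype == "MOSTLY_Q8_0":
            return "Q8_0"  # falls back to F16 upstream when the tensor shape cannot be quantized
        return "F16"
    return "F32"

# e.g. choose_output_dtype("model.norm.weight", "output_norm.weight", 1, "MOSTLY_F16", False, False) == "F32"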
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces toktypes.append(gguf.TokenType.USER_DEFINED) else: @@ -591,15 +537,9 @@ class Model: if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": # ref: https://huggingface.co/tiiuae/falcon-7b res = "falcon" - if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": - # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base - res = "falcon3" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" - if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": - # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 - res = "bert-bge-large" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": # ref: https://huggingface.co/mosaicml/mpt-7b res = "mpt" @@ -627,9 +567,6 @@ class Model: if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": # ref: https://huggingface.co/databricks/dbrx-base res = "dbrx" - if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448": - # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - res = "jina-v1-en" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en res = "jina-v2-en" @@ -648,7 +585,7 @@ class Model: if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code res = "jina-v2-code" - if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516": + if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": # ref: https://huggingface.co/THUDM/glm-4-9b-chat res = "chatglm-bpe" if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": @@ -666,39 +603,6 @@ class Model: if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M res = "smollm" - if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7": - # ref: https://huggingface.co/bigscience/bloom - res = "bloom" - if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": - # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small - res = "gpt3-finnish" - if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": - # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct - res = "exaone" - if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": - # ref: https://huggingface.co/microsoft/phi-2 - res = "phi-2" - if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": - # ref: https://huggingface.co/facebook/chameleon-7b - res = "chameleon" - if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": - # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 - res = "minerva-7b" - if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": - # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base - res = "roberta-bpe" - if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": - # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct - res = "gigachat" - if chkhsh == 
"d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": - # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct - res = "megrez" - if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": - # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 - res = "deepseek-v3" - if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": - # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B - res = "deepseek-r1-qwen" if res is None: logger.warning("\n") @@ -721,9 +625,6 @@ class Model: return res # Marker: End get_vocab_base_pre - def _set_vocab_none(self) -> None: - self.gguf_writer.add_tokenizer_model("none") - def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") @@ -1005,7 +906,7 @@ class GPTNeoXModel(Model): return tensors -@Model.register("BloomForCausalLM", "BloomModel") +@Model.register("BloomForCausalLM") class BloomModel(Model): model_arch = gguf.MODEL_ARCH.BLOOM @@ -1560,7 +1461,7 @@ class StableLMModel(Model): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA @@ -1586,17 +1487,6 @@ class LlamaModel(Model): special_vocab._set_special_token("eot", 32010) special_vocab.add_to_gguf(self.gguf_writer) - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - # Apply to granite small models only - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) - def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -1613,6 +1503,17 @@ class LlamaModel(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) + @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: @@ -1668,35 +1569,6 @@ class LlamaModel(Model): return [(self.map_tensor_name(name), data_torch)] - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = 
rope_scaling.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - def prepare_tensors(self): super().prepare_tensors() @@ -1707,178 +1579,6 @@ class LlamaModel(Model): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeciLMForCausalLM") -class DeciModel(Model): - model_arch = gguf.MODEL_ARCH.DECI - - @staticmethod - def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: - # DeciLM-specific code - intermediate_size = int(2 * ffn_mult * n_embd / 3) - return DeciModel._find_multiple(intermediate_size, 256) - - @staticmethod - def _find_multiple(n: int, k: int) -> int: - # DeciLM-specific code - if n % k == 0: - return n - return n + k - (n % k) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B - _block_configs: list[dict[str,Any]] = self.hparams["block_configs"] - assert self.block_count == len(_block_configs) - self._num_kv_heads = list() - self._num_heads = list() - _ffn_multipliers = list() - # ***linear attention layer*** - # if n_heads_in_group is None and replace_with_linear is True - # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads - # ***attention-free layer*** - # if n_heads_in_group is None and replace_with_linear is False - # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 - # ***normal attention-layer*** - # if n_heads_in_group is not None, then - # _num_kv_heads[il] is num_attention_head // n_heads_in_group and - # _num_heads[il] is num_attention_head - for il in range(len(_block_configs)): - if _block_configs[il]["attention"]["n_heads_in_group"] is None: - if _block_configs[il]["attention"]["replace_with_linear"] is True: - self._num_kv_heads.append(0) - self._num_heads.append(self.hparams["num_attention_heads"]) - else: - self._num_kv_heads.append(0) - self._num_heads.append(0) - else: - self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"]) - self._num_heads.append(self.hparams["num_attention_heads"]) - _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_heads) - assert self.block_count == len(_ffn_multipliers) - assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) - assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int) - assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float) - self._ffn_dims: list[int] = [ - DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"]) - for multiplier in _ffn_multipliers - ] - - def set_vocab(self): - # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's - # eos_token from '|eot_id|' to '|end_of_text|' - if 
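The generate_extra_tensors code above (removed here for LlamaModel, and duplicated for DeciModel just below) derives per-frequency correction factors for Llama-3-style rope scaling. The same computation, pulled out as a standalone NumPy helper for readability; the function name is mine and the defaults mirror the fallbacks used in the diff:

import math
import numpy as np

def llama3_rope_factors(dim: int, base: float = 10000.0, factor: float = 8.0,
                        low_freq_factor: float = 1.0, high_freq_factor: float = 4.0,
                        old_context_len: int = 8192) -> np.ndarray:
    # one inverse frequency per pair of rotary dimensions
    freqs = 1.0 / (base ** (np.arange(0, dim, 2, dtype=np.float32) / dim))

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    rope_factors = []
    for freq in freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:    # short wavelengths (high frequency): leave unscaled
            rope_factors.append(1.0)
        elif wavelen > low_freq_wavelen:   # long wavelengths (low frequency): stretch by `factor`
            rope_factors.append(factor)
        else:                              # smooth interpolation in between
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
    return np.array(rope_factors, dtype=np.float32)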
self.hparams.get("vocab_size", 128256) == 128256: - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - else: - # DeciLM-7B - self._set_vocab_llama_hf() - - def set_gguf_parameters(self): - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_heads) - assert self.block_count == len(self._ffn_dims) - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - self.gguf_writer.add_head_count(self._num_heads) - self.gguf_writer.add_feed_forward_length(self._ffn_dims) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_file_type(self.ftype) - else: # DeciLM-7B - super().set_gguf_parameters() - if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B - self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"] - assert self.block_count == len(self._num_kv_heads) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - if bid is not None: - if "num_key_value_heads_per_layer" in self.hparams: - n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid] - elif "block_configs" in self.hparams: - n_kv_head = self._num_kv_heads[bid] - n_head = self._num_heads[bid] - else: - n_kv_head = self.hparams.get("num_key_value_heads") - else: - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = DeciModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] - - def generate_extra_tensors(self) -> 
Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - def prepare_tensors(self): - super().prepare_tensors() - - @Model.register("BitnetForCausalLM") class BitnetModel(Model): model_arch = gguf.MODEL_ARCH.BITNET @@ -1891,16 +1591,15 @@ class BitnetModel(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) - def weight_quant(self, weight: Tensor) -> Tensor: + def weight_quant(self, weight): dtype = weight.dtype weight = weight.float() - scale = weight.abs().mean().clamp(min=1e-5) - iscale = 1 / scale - # TODO: multiply by the scale directly instead of inverting it twice - # (this is also unnecessarily doubly inverted upstream) - # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 - result = (weight * iscale).round().clamp(-1, 1) / iscale - return result.type(dtype) + s = 1 / weight.abs().mean().clamp(min=1e-5) + weight = (weight * s).round().clamp(-1, 1) / s + scale = weight.abs().max().unsqueeze(0) + weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype) + weight = torch.sign(weight).type(dtype) + return weight.type(dtype), scale.type(torch.float32) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name) @@ -1915,9 +1614,11 @@ class BitnetModel(Model): gguf.MODEL_TENSOR.FFN_GATE, ]): # transform weight into 1/0/-1 (in fp32) - data_torch = self.weight_quant(data_torch) - - yield (new_name, data_torch) + weight_torch, scale_torch = self.weight_quant(data_torch) + yield (new_name, weight_torch) + yield (new_name.removesuffix(".weight") + ".scale", scale_torch) + else: + yield (new_name, data_torch) @Model.register("GrokForCausalLM") @@ -2036,7 +1737,7 @@ class DbrxModel(Model): return [(new_name, data_torch)] - def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: del name, new_name, bid # unused return n_dims > 1 @@ -2047,97 +1748,19 @@ class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): - super().set_gguf_parameters() - embedding_scale = 
float(self.hparams["scale_emb"]) - self.gguf_writer.add_embedding_scale(embedding_scale) - logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") - residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 - self.gguf_writer.add_residual_scale(residual_scale) - logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") - logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] - self.gguf_writer.add_logit_scale(logit_scale) - logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") - if self.hparams.get("rope_scaling") is not None: - if self.hparams["rope_scaling"].get("type") == "longrope": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) - logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - # HF models permute some of the tensors, so we need to undo that - if name.endswith(("q_proj.weight")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("MiniCPM3ForCausalLM") -class MiniCPM3Model(Model): - model_arch = gguf.MODEL_ARCH.MINICPM3 - - def set_gguf_parameters(self): - hparams = self.hparams - + block_count = self.hparams["num_hidden_layers"] + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - 
self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: - self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: - rope_dims = self.hparams["qk_rope_head_dim"] - - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) def set_vocab(self): - self._set_vocab_sentencepiece() + self._set_vocab_llama_hf() def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -2149,6 +1772,20 @@ class MiniCPM3Model(Model): .reshape(weights.shape) ) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + + return [(self.map_tensor_name(name), data_torch)] + @Model.register("QWenLMHeadModel") class QwenModel(Model): @@ -2202,75 +1839,6 @@ class Qwen2Model(Model): except FileNotFoundError: self._set_vocab_gpt2() - def set_gguf_parameters(self): - super().set_gguf_parameters() - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) - - -@Model.register("Qwen2VLForConditionalGeneration") -class Qwen2VLModel(Model): - model_arch = gguf.MODEL_ARCH.QWEN2VL - - def set_gguf_parameters(self): - super().set_gguf_parameters() - mrope_section = self.hparams["rope_scaling"]["mrope_section"] - mrope_section += [0] * max(0, 4 - len(mrope_section)) - self.gguf_writer.add_rope_dimension_sections(mrope_section) - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def get_tensors(self) -> 
Iterator[tuple[str, Tensor]]: - for name, data in super().get_tensors(): - if name.startswith("visual."): - continue - yield name, data - - -@Model.register("WavTokenizerDec") -class WavTokenizerDecModel(Model): - model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - if \ - name.endswith("codebook.cluster_size") or \ - name.endswith("codebook.embed_avg") or \ - name.endswith("codebook.inited"): - logger.debug(f"Skipping {name!r}") - return [] - - logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") - - return [(self.map_tensor_name(name), data_torch)] - - def set_vocab(self): - self._set_vocab_none() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) - self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) - self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) - self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) - self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) - - self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) - self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) - - self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) - self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) - - self.gguf_writer.add_causal_attention(False) - @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): @@ -2400,15 +1968,6 @@ class Phi3MiniModel(Model): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): - # Phi-4 model uses GPT2Tokenizer - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - tokenizer_class = tokenizer_config_json['tokenizer_class'] - if tokenizer_class == 'GPT2Tokenizer': - return self._set_vocab_gpt2() - from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / 'tokenizer.model' @@ -2525,18 +2084,6 @@ class Phi3MiniModel(Model): self.gguf_writer.add_rope_dimension_count(rope_dims) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_file_type(self.ftype) - sliding_window = self.hparams.get("sliding_window") - # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models - if sliding_window is None: - sliding_window = 0 - self.gguf_writer.add_sliding_window(sliding_window) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head # write rope scaling for long context (128k) model rope_scaling = self.find_hparam(['rope_scaling'], True) @@ -2567,65 +2114,8 @@ class Phi3MiniModel(Model): if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), 
torch.tensor(short_factors, dtype=torch.float32)) - - -@Model.register("PhiMoEForCausalLM") -class PhiMoeModel(Phi3MiniModel): - model_arch = gguf.MODEL_ARCH.PHIMOE - - _experts: list[dict[str, Tensor]] | None = None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) - self.gguf_writer.add_expert_count(self.hparams["num_local_experts"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) @Model.register("PlamoForCausalLM") @@ -2885,67 +2375,7 @@ class InternLM2Model(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("InternLM3ForCausalLM") -class InternLM3Model(Model): - model_arch = gguf.MODEL_ARCH.LLAMA - - def set_vocab(self): - tokens, scores, toktypes = self._create_vocab_sentencepiece() - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - if "added_tokens_decoder" in tokenizer_config_json: - for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items(): - if token_data.get("special"): - token_id = int(token_id) - token = token_data["content"] - special_vocab._set_special_token(token, token_id) - # update eos token - if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids: - special_vocab.special_token_ids["eos"] = token_id - - special_vocab.add_to_gguf(self.gguf_writer) - - def 
set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") +@Model.register("BertModel", "CamembertModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT @@ -2986,8 +2416,7 @@ class BertModel(Model): # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" # convert to phantom space vocab def phantom(tok): @@ -3011,73 +2440,13 @@ class BertModel(Model): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if name.startswith("bert."): - name = name[5:] - - if name.endswith(".gamma"): - name = name[:-6] + ".weight" - - if name.endswith(".beta"): - name = name[:-5] + ".bias" - # we are only using BERT for embeddings so we don't need the pooling layer if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): return [] # we don't need these - if name.startswith("cls.predictions"): - return [] - - if name.startswith("cls.seq_relationship"): - return [] - return [(self.map_tensor_name(name), data_torch)] -@Model.register("RobertaModel") -class RobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def set_vocab(self): - """Support BPE tokenizers for roberta models""" - bpe_tok_path = self.dir_model / "tokenizer.json" - if bpe_tok_path.exists(): - self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - - 
else: - return super().set_vocab() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # if name starts with "roberta.", remove the prefix - # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset:,:] - - return super().modify_tensors(data_torch, name, bid) - - @Model.register("NomicBertModel") class NomicBertModel(BertModel): model_arch = gguf.MODEL_ARCH.NOMIC_BERT @@ -3108,117 +2477,6 @@ class NomicBertModel(BertModel): self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) -@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") -class XLMRobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - 
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - # realign tokens (see HF tokenizer code) - tokens = [b'', b'', b'', b''] + tokens[3:-1] - scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] - toktypes = [ - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.UNKNOWN, - ] + toktypes[3:-1] - - self.gguf_writer.add_tokenizer_model("t5") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) - if precompiled_charsmap: - self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # if name starts with "roberta.", remove the prefix - # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset:,:] - - return super().modify_tensors(data_torch, name, bid) - - @Model.register("GemmaForCausalLM") class GemmaModel(Model): model_arch = gguf.MODEL_ARCH.GEMMA @@ -3322,165 +2580,7 @@ class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 -@Model.register("Rwkv6ForCausalLM") -class Rwkv6Model(Model): - model_arch = gguf.MODEL_ARCH.RWKV6 - - def set_vocab(self): - assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() - vocab_size = self.hparams.get("vocab_size", 65536) - - tokens: list[bytes] = [''.encode("utf-8")] - toktypes: list[int] = [gguf.TokenType.CONTROL] - - with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: - lines = f.readlines() - for line in lines: - parts = line.split(' ') - assert len(parts) >= 3 - token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) - token = token.encode("utf-8") if isinstance(token, str) else token - assert isinstance(token, bytes) - assert len(token) == token_len - token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" - tokens.append(token_text.encode("utf-8")) - toktypes.append(gguf.TokenType.NORMAL) - remainder = vocab_size - len(tokens) - assert remainder >= 0 - for i in range(len(tokens), vocab_size): - tokens.append(f"[PAD{i}]".encode("utf-8")) - toktypes.append(gguf.TokenType.UNUSED) - - self.gguf_writer.add_tokenizer_model("rwkv") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.chat_template = "rwkv-world" - # hack: Add '\n\n' as the EOT token to make it chat normally - special_vocab._set_special_token("eot", 261) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_size = self.hparams["head_size"] - 
hidden_size = self.hparams["hidden_size"] - layer_norm_eps = self.hparams["layer_norm_epsilon"] - rescale_every_n_layers = self.hparams["rescale_every"] - intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32) - time_mix_extra_dim = 64 if hidden_size == 4096 else 32 - time_decay_extra_dim = 128 if hidden_size == 4096 else 64 - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_layer_norm_eps(layer_norm_eps) - self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) - self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - lerp_weights: dict[int, dict[str, Tensor]] = {} - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - if not (new_name.endswith(".weight") or new_name.endswith(".bias")): - new_name += ".weight" - - if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"): - data_torch = data_torch.transpose(0, 1) - - if new_name.endswith("time_mix_w2.weight"): - data_torch = data_torch.permute(0, 2, 1) - - if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: - data_torch = data_torch.squeeze() - - try: - rescale_every_n_layers = self.hparams["rescale_every"] - if rescale_every_n_layers > 0: - if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): - data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) - except KeyError: - pass - - # concat time_mix_lerp weights to reduce some cpu overhead - # also reduces the number of tensors in the model - if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name: - try: - self.lerp_weights[bid][new_name] = data_torch - except KeyError: - self.lerp_weights[bid] = {new_name: data_torch} - if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]): - new_name = f"blk.{bid}.time_mix_lerp_fused.weight" - data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1) - yield (new_name, data) - return - - yield (new_name, data_torch) - - -@Model.register("RWKV6Qwen2ForCausalLM") -class RWKV6Qwen2Model(Rwkv6Model): - model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - num_attention_heads = self.hparams["num_attention_heads"] - num_key_value_heads = self.hparams["num_key_value_heads"] - hidden_size = self.hparams["hidden_size"] - head_size = hidden_size // num_attention_heads - rms_norm_eps = self.hparams["rms_norm_eps"] - intermediate_size = self.hparams["intermediate_size"] - time_mix_extra_dim = 64 if hidden_size >= 4096 else 32 - time_decay_extra_dim = 128 if hidden_size >= 4096 else 64 - - # RWKV 
isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) - self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - - # special parameters for time_mixing in RWKV6QWEN2 - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_token_shift_count(1) - # RWKV6QWEN2 use grouped key/value like GQA - self.gguf_writer.add_head_count_kv(num_key_value_heads) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - for new_name, data in super().modify_tensors(data_torch, name, bid): - if "time_mix_w1" in new_name or "time_mix_w2" in new_name: - data = data.view(5, -1, data.shape[-1]) - # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg - # permute them here to avoid code changes - data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1]) - if "w2" in new_name: - data = data.view(5, -1, data.shape[-1]) - yield (new_name, data) - continue - yield (new_name, data) - - -@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") +@Model.register("MambaForCausalLM", "MambaLMHeadModel") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA @@ -3511,10 +2611,7 @@ class MambaModel(Model): # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - use_dt_b_c_norm = False - # For falconmamba we do apply RMS norm on B / DT and C layers - if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): - use_dt_b_c_norm = True + # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model @@ -3522,13 +2619,12 @@ class MambaModel(Model): self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_inner_size(d_inner) self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers self.gguf_writer.add_file_type(self.ftype) _tok_embd = None @@ -3555,6 +2651,19 @@ class MambaModel(Model): return [(new_name, data_torch)] + def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del n_dims # unused + + return bid is not None and new_name in ( + self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [ + gguf.MODEL_TENSOR.SSM_CONV1D, + gguf.MODEL_TENSOR.SSM_X, + gguf.MODEL_TENSOR.SSM_DT, + gguf.MODEL_TENSOR.SSM_A, + gguf.MODEL_TENSOR.SSM_D, + ] + ) 
+ @Model.register("CohereForCausalLM") class CommandR2Model(Model): @@ -3574,24 +2683,6 @@ class CommandR2Model(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("Cohere2ForCausalLM") -class Cohere2Model(Model): - model_arch = gguf.MODEL_ARCH.COHERE2 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - rotary_pct = self.hparams["rotary_pct"] - hidden_size = self.hparams["hidden_size"] - num_attention_heads = self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - @Model.register("OlmoForCausalLM") @Model.register("OLMoForCausalLM") class OlmoModel(Model): @@ -3620,71 +2711,6 @@ class OlmoModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("Olmo2ForCausalLM") -class Olmo2Model(Model): - model_arch = gguf.MODEL_ARCH.OLMO2 - - -@Model.register("OlmoeForCausalLM") -class OlmoeModel(Model): - model_arch = gguf.MODEL_ARCH.OLMOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_layer_norm_rms_eps(1e-5) - if (n_experts := self.hparams.get("num_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - - _experts: list[dict[str, Tensor]] | None = None - - # Copied from: Qwen2MoeModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.hparams["num_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - # Copied from: Qwen2MoeModel - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - @Model.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 @@ -3723,14 +2749,6 @@ class JinaBertV2Model(BertModel): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # if name starts with "bert.", remove the prefix - # e.g. 
https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - if name.startswith("bert."): - name = name[5:] - - return super().modify_tensors(data_torch, name, bid) - @Model.register("OpenELMForCausalLM") class OpenELMModel(Model): @@ -3958,99 +2976,7 @@ class ArcticModel(Model): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekForCausalLM") -class DeepseekModel(Model): - model_arch = gguf.MODEL_ARCH.DEEPSEEK - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - - self.gguf_writer.add_rope_dimension_count(rope_dim) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_weights_scale(1.0) - self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) - - _experts: list[dict[str, Tensor]] | None = None - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = DeepseekModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - @Model.register("DeepseekV2ForCausalLM") -@Model.register("DeepseekV3ForCausalLM") class DeepseekV2Model(Model): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 @@ -4072,15 +2998,6 @@ class 
DeepseekV2Model(Model): self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) - - if hparams["scoring_func"] == "sigmoid": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - elif hparams["scoring_func"] == "softmax": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}") - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: @@ -4093,16 +3010,6 @@ class DeepseekV2Model(Model): _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # rename e_score_correction_bias tensors - if name.endswith("e_score_correction_bias"): - name = name.replace("e_score_correction_bias", "e_score_correction.bias") - - # skip Multi-Token Prediction (MTP) layers - block_count = self.hparams["num_hidden_layers"] - match = re.match(r"model.layers.(\d+)", name) - if match and int(match.group(1)) >= block_count: - return [] - # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] @@ -4291,145 +3198,6 @@ class T5Model(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("T5EncoderModel") -class T5EncoderModel(Model): - model_arch = gguf.MODEL_ARCH.T5ENCODER - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.shared_token_embeddings_found = False - - def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - # many older models use spiece.model tokenizer model filename - if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'spiece.model' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - - # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE - # assure the tokenizer model file name is correct - assert tokenizer_path.name == 'tokenizer.model' - return self._set_vocab_sentencepiece() - else: - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * 
vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - self.gguf_writer.add_tokenizer_model("t5") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) - if precompiled_charsmap: - self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_add_eos_token(True) - - def set_gguf_parameters(self): - if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning("Couldn't find context length in config.json, assuming default value of 512") - n_ctx = 512 - self.gguf_writer.add_context_length(n_ctx) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) - self.gguf_writer.add_block_count(self.hparams["num_layers"]) - self.gguf_writer.add_head_count(self.hparams["num_heads"]) - self.gguf_writer.add_key_length(self.hparams["d_kv"]) - self.gguf_writer.add_value_length(self.hparams["d_kv"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. 
We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. - if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: - if not self.shared_token_embeddings_found: - name = "shared.weight" - self.shared_token_embeddings_found = True - else: - logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return [] - - return [(self.map_tensor_name(name), data_torch)] - - @Model.register("JAISLMHeadModel") class JaisModel(Model): model_arch = gguf.MODEL_ARCH.JAIS @@ -4444,7 +3212,10 @@ class JaisModel(Model): # Embeddings scale self.embeddings_scale = 1.0 + # note: For some JAIS flavors, output is tied to (same as) wte in original model + self.output_is_wte = False if 'mup_embeddings_scale' in self.hparams: + self.output_is_wte = True # Hack (?) self.embeddings_scale = self.hparams['mup_embeddings_scale'] elif 'embeddings_scale' in self.hparams: self.embeddings_scale = self.hparams['embeddings_scale'] @@ -4501,7 +3272,10 @@ class JaisModel(Model): if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): tensors.append((new_name, data_torch * self.embeddings_scale)) + if self.output_is_wte: + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale)) elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + assert not self.output_is_wte tensors.append((new_name, data_torch * self.width_scale)) else: tensors.append((new_name, data_torch)) @@ -4513,7 +3287,7 @@ class JaisModel(Model): self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) -@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") +@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration") class ChatGLMModel(Model): model_arch = gguf.MODEL_ARCH.CHATGLM @@ -4619,15 +3393,47 @@ class ChatGLMModel(Model): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"]) + vocab_size = hparams["padded_vocab_size"] assert max(tokenizer.get_vocab().values()) < vocab_size - tokens, toktypes, tokpre = self.get_vocab_base() + tokpre = self.get_vocab_base_pre(tokenizer) + + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[ChatGLMModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank) + assert len(merged) >= 2 and len(merged) <= 7 + merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged))) + + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined + added_vocab = tokenizer.get_added_vocab() + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) 
self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges # only add special tokens when they were not already loaded from config.json special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) @@ -4638,20 +3444,16 @@ class ChatGLMModel(Model): def set_gguf_parameters(self): n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head)) + n_head_kv = self.hparams.get("multi_query_group_num", n_head) self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed))) - self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"])) + self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed)) + self.gguf_writer.add_block_count(self.hparams["num_layers"]) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5)) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - if "attention_dim" in self.hparams: - rope_dim = self.hparams["attention_dim"] - else: - rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_rope_dimension_count(64) self.gguf_writer.add_add_bos_token(False) rope_freq = 10000 if "rope_ratio" in self.hparams: @@ -4661,223 +3463,12 @@ class ChatGLMModel(Model): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."): + if name.endswith(".rotary_pos_emb.inv_freq"): return [] name = name.removeprefix("transformer.") return [(self.map_tensor_name(name), data_torch)] - -@Model.register("NemotronForCausalLM") -class NemotronModel(Model): - model_arch = gguf.MODEL_ARCH.NEMOTRON - - def set_vocab(self): - self._set_vocab_sentencepiece() - self.gguf_writer.add_pad_token_id(0) - self.gguf_writer.add_unk_token_id(1) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"]) - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - - # * Partial RoPE - rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - - # * RopeScaling for Nemotron - if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - else: - 
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side - # model.layers.{l}.input_layernorm.weight - # model.layers.{l}.post_attention_layernorm.weight - # model.norm.weight - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - return [(self.map_tensor_name(name), data_torch)] - - -@Model.register("ExaoneForCausalLM") -class ExaoneModel(Model): - model_arch = gguf.MODEL_ARCH.EXAONE - - def set_gguf_parameters(self): - hparams = self.hparams - - assert (hparams["activation_function"] == "silu") - - max_position_embeddings = hparams["max_position_embeddings"] - embed_dim = hparams["hidden_size"] - num_heads = hparams["num_attention_heads"] - num_kv_heads = hparams.get("num_key_value_heads", num_heads) - layer_norm_eps = hparams["layer_norm_epsilon"] - intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim - num_layers = hparams["num_layers"] - # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 - # attention_dropout_rate = hparams["attention_dropout"] - # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0 - # embed_dropout_rate = hparams["embed_dropout"] - self.gguf_writer.add_embedding_length(embed_dim) - self.gguf_writer.add_head_count(num_heads) - self.gguf_writer.add_head_count_kv(num_kv_heads) - self.gguf_writer.add_context_length(max_position_embeddings) - self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_block_count(num_layers) - self.gguf_writer.add_file_type(self.ftype) - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) - rotary_factor = rotary_factor if rotary_factor is not None else 1.0 - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]: - if hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) 
- elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - -@Model.register("GraniteForCausalLM") -class GraniteModel(LlamaModel): - """Conversion for IBM's GraniteForCausalLM""" - model_arch = gguf.MODEL_ARCH.GRANITE - - def set_gguf_parameters(self): - """Granite uses standard llama parameters with the following differences: - - - No head_dim support - - New multiplier params: - - attention_scale - - embedding_scale - - residual_scale - - logits_scaling - """ - if head_dim := self.hparams.pop("head_dim", None): - logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) - super().set_gguf_parameters() - # NOTE: Convert _multiplier params to _scale params for naming - # consistency - if attention_scale := self.hparams.get("attention_multiplier"): - self.gguf_writer.add_attention_scale(attention_scale) - logger.info("gguf: (granite) attention_scale = %s", attention_scale) - if embedding_scale := self.hparams.get("embedding_multiplier"): - self.gguf_writer.add_embedding_scale(embedding_scale) - logger.info("gguf: (granite) embedding_scale = %s", embedding_scale) - if residual_scale := self.hparams.get("residual_multiplier"): - self.gguf_writer.add_residual_scale(residual_scale) - logger.info("gguf: (granite) residual_scale = %s", residual_scale) - if logits_scale := self.hparams.get("logits_scaling"): - self.gguf_writer.add_logit_scale(logits_scale) - logger.info("gguf: (granite) logits_scale = %s", logits_scale) - - -@Model.register("GraniteMoeForCausalLM") -class GraniteMoeModel(GraniteModel): - """Conversion for IBM's GraniteMoeForCausalLM""" - model_arch = gguf.MODEL_ARCH.GRANITE_MOE - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - """In modeling_granitemoe, the JetMoe implementation of parallel experts - is used. This essentially merges w1 and w3 into a single tensor with 2x - the hidden size that is then split during forward. To keep compatibility - with existing mixtral support, we pull them apart here. 
- """ - - if name.endswith("block_sparse_moe.input_linear.weight"): - ffn_dim = self.hparams["intermediate_size"] - assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" - gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :] - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up), - ] - - return super().modify_tensors(data_torch, name, bid) - - -@Model.register("ChameleonForConditionalGeneration") -@Model.register("ChameleonForCausalLM") # obsolete -class ChameleonModel(Model): - model_arch = gguf.MODEL_ARCH.CHAMELEON - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False)) - - def set_vocab(self): - self._set_vocab_gpt2() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # ignore image tokenizer for now - # TODO: remove this once image support is implemented for Chameleon - if name.startswith("model.vqmodel"): - return [] - - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - hidden_dim = self.hparams.get("hidden_size") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - if name.endswith(("q_norm.weight", "q_norm.bias")): - data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) - if name.endswith(("k_norm.weight", "k_norm.bias")): - data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) - - return [(self.map_tensor_name(name), data_torch)] - - # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 - @staticmethod - def _reverse_hf_permute(data_torch, n_heads, hidden_dim): - head_dim = hidden_dim // n_heads - data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1) - data_torch = data_torch.repeat_interleave(n_heads, 0) - return data_torch - - ###### CONVERSION LOGIC ###### @@ -4959,8 +3550,8 @@ def parse_args() -> argparse.Namespace: help="path to write to; default: based on input. 
{ftype} will be replaced by the outtype.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( "--bigendian", action="store_true", @@ -4969,7 +3560,6 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "model", type=Path, help="directory containing model file", - nargs="?", ) parser.add_argument( "--use-temp-file", action="store_true", @@ -5007,15 +3597,8 @@ def parse_args() -> argparse.Namespace: "--metadata", type=Path, help="Specify the path for an authorship metadata override file" ) - parser.add_argument( - "--print-supported-models", action="store_true", - help="Print the supported models" - ) - args = parser.parse_args() - if not args.print_supported_models and args.model is None: - parser.error("the following arguments are required: model") - return args + return parser.parse_args() def split_str_to_n_bytes(split_str: str) -> int: @@ -5039,11 +3622,6 @@ def split_str_to_n_bytes(split_str: str) -> int: def main() -> None: args = parse_args() - if args.print_supported_models: - logger.error("Supported models:") - Model.print_registered_models() - sys.exit(0) - if args.verbose: logging.basicConfig(level=logging.DEBUG) else: @@ -5060,8 +3638,6 @@ def main() -> None: "f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, - "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, "auto": gguf.LlamaFileType.GUESSED, } diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index cea34413f..d5a2d925e 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -17,7 +17,7 @@ # # python3 convert_hf_to_gguf_update.py # -# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated +# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py # - Update llama.cpp with the new pre-tokenizer if necessary # # TODO: generate tokenizer tests for llama.cpp @@ -31,7 +31,6 @@ import re import requests import sys import json -import shutil from hashlib import sha256 from enum import IntEnum, auto @@ -65,50 +64,36 @@ else: # TODO: add models here, base models preferred models = [ - {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, - {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, - {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, - {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, - {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, - {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, - {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, 
"repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, - {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", }, - {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, - {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, - {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, - {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, - {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", }, - {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, - {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, - {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, - {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", }, - {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", }, - {"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", }, - {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! - {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, - {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, - {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, - {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, - {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, - {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B - {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", }, - {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, - {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, - {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, - {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", }, - {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", }, - {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", }, - {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", }, - {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", }, - {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, - {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, - {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": 
"https://huggingface.co/facebook/chameleon-7b", }, - {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, - {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"}, - {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"}, - {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"}, - {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"}, - {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"}, + {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, + {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, + {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, + {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, + {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, + {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, + {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, + {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, + {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, + {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, + {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", }, + {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, + {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, + {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, + {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", }, + {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", }, + {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! 
+ {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, + {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, + {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, + {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, + {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, + {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B + {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", }, + {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, + {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, + {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, + {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", }, + {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", }, + {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", }, ] @@ -137,27 +122,12 @@ def download_model(model): if tokt == TOKENIZER_TYPE.UGM: files.append("spiece.model") - if os.path.isdir(repo): - # If repo is a path on the file system, copy the directory - for file in files: - src_path = os.path.join(repo, file) - dst_path = f"models/tokenizers/{name}/{file}" - if os.path.isfile(dst_path): - logger.info(f"{name}: File {dst_path} already exists - skipping") - continue - if os.path.isfile(src_path): - shutil.copy2(src_path, dst_path) - logger.info(f"{name}: Copied {src_path} to {dst_path}") - else: - logger.warning(f"{name}: Source file {src_path} does not exist") - else: - # If repo is a URL, download the files - for file in files: - save_path = f"models/tokenizers/{name}/{file}" - if os.path.isfile(save_path): - logger.info(f"{name}: File {save_path} already exists - skipping") - continue - download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) + for file in files: + save_path = f"models/tokenizers/{name}/{file}" + if os.path.isfile(save_path): + logger.info(f"{name}: File {save_path} already exists - skipping") + continue + download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) for model in models: diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py index 29b14e98d..7b00b4398 100755 --- a/convert_llama_ggml_to_gguf.py +++ b/convert_llama_ggml_to_gguf.py @@ -116,7 +116,7 @@ class Tensor: assert quant is not None, 'Unknown tensor type' (blksize, tysize) = quant offset += 12 - self.dtype= gguf.GGMLQuantizationType(dtype) + self.dtype= dtype self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)]) offset += 4 * n_dims self.name = bytes(data[offset:offset + name_len]) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 6dea14a23..a88d0d4a9 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -12,7 +12,6 @@ import json from math import prod from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast 
-from transformers import AutoConfig import torch @@ -226,15 +225,12 @@ def get_base_tensor_name(lora_tensor_name: str) -> str: base_name = lora_tensor_name.replace("base_model.model.", "") base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight") - # models produced by mergekit-extract-lora have token embeddings in the adapter - base_name = base_name.replace(".lora_embedding_A", ".weight") - base_name = base_name.replace(".lora_embedding_B", ".weight") return base_name def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file") + description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file") parser.add_argument( "--outfile", type=Path, help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", @@ -260,27 +256,17 @@ def parse_args() -> argparse.Namespace: help="only print out what will be done, without writing any new files", ) parser.add_argument( - "--base", type=Path, - help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config", - ) - parser.add_argument( - "--base-model-id", type=str, - help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')", + "--base", type=Path, required=True, + help="directory containing base model file", ) parser.add_argument( "lora_path", type=Path, - help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", + help="directory containing LoRA adapter file", ) return parser.parse_args() -def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: - # normally, adapter does not come with base model config, we need to load it from AutoConfig - config = AutoConfig.from_pretrained(hf_model_id) - return config.to_dict() - - if __name__ == '__main__': args = parse_args() logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) @@ -295,9 +281,8 @@ if __name__ == '__main__': ftype = ftype_map[args.outtype] - dir_base_model: Path | None = args.base + dir_base_model: Path = args.base dir_lora: Path = args.lora_path - base_model_id: str | None = args.base_model_id lora_config = dir_lora / "adapter_config.json" input_model = dir_lora / "adapter_model.safetensors" @@ -316,32 +301,9 @@ if __name__ == '__main__': input_model = os.path.join(dir_lora, "adapter_model.bin") lora_model = torch.load(input_model, map_location="cpu", weights_only=True) - # load LoRA config - with open(lora_config, "r") as f: - lparams: dict[str, Any] = json.load(f) - # load base model - if base_model_id is not None: - logger.info(f"Loading base model from Hugging Face: {base_model_id}") - hparams = load_hparams_from_hf(base_model_id) - elif dir_base_model is None: - if "base_model_name_or_path" in lparams: - model_id = lparams["base_model_name_or_path"] - logger.info(f"Loading base model from Hugging Face: {model_id}") - try: - hparams = load_hparams_from_hf(model_id) - except OSError as e: - logger.error(f"Failed to load base model config: {e}") - logger.error("Please try downloading the base 
model and add its path to --base") - sys.exit(1) - else: - logger.error("'base_model_name_or_path' is not found in adapter_config.json") - logger.error("Base model config is required. Please download the base model and add its path to --base") - sys.exit(1) - else: - logger.info(f"Loading base model: {dir_base_model.name}") - hparams = Model.load_hparams(dir_base_model) - + logger.info(f"Loading base model: {dir_base_model.name}") + hparams = Model.load_hparams(dir_base_model) with torch.inference_mode(): try: model_class = Model.from_model_architecture(hparams["architectures"][0]) @@ -361,19 +323,13 @@ if __name__ == '__main__': self.dir_model_card = dir_lora_model self.lora_alpha = float(lora_alpha) - def set_vocab(self): - pass - def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") def set_gguf_parameters(self): self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # Never add extra tensors (e.g. rope_freqs) for LoRA adapters - return () + super().set_gguf_parameters() def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_map: dict[str, PartialLoraTensor] = {} @@ -382,20 +338,12 @@ if __name__ == '__main__': if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) - # note: mergekit-extract-lora also adds token embeddings to the adapter - is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name - is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name + is_lora_a = ".lora_A.weight" in name + is_lora_b = ".lora_B.weight" in name if not is_lora_a and not is_lora_b: if ".base_layer.weight" in name: continue - # mergekit-extract-lora add these layernorm to the adapter, we need to keep them - if "_layernorm" in name or ".norm" in name: - yield (base_name, tensor) - continue logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") - if ".embed_tokens.weight" in name or ".lm_head.weight" in name: - logger.error("Embeddings is present in the adapter. 
This can be due to new tokens added during fine tuning") - logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948") sys.exit(1) if base_name in tensor_map: @@ -415,32 +363,17 @@ if __name__ == '__main__': yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B))) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - dest = list(super().modify_tensors(data_torch, name, bid)) - # some archs may have the same tensor for lm_head and output (tie word embeddings) - # in this case, adapters targeting lm_head will fail when using llama-export-lora - # therefore, we ignore them for now - # see: https://github.com/ggerganov/llama.cpp/issues/9065 - if name == "lm_head.weight" and len(dest) == 0: - raise ValueError("lm_head is present in adapter, but is ignored in base model") + dest = super().modify_tensors(data_torch, name, bid) for dest_name, dest_data in dest: - # mergekit-extract-lora add these layernorm to the adapter - if "_norm" in dest_name: - assert dest_data.dim() == 1 - yield (dest_name, dest_data) - continue - - # otherwise, we must get the lora_A and lora_B tensors assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() - # note: mergekit-extract-lora flip and transpose A and B - # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd() - if "token_embd.weight" in dest_name: - lora_a = lora_a.T - yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_b", lora_b) + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + alpha: float = lparams["lora_alpha"] model_instance = LoraModel( @@ -453,7 +386,6 @@ if __name__ == '__main__': dry_run=args.dry_run, dir_lora_model=dir_lora, lora_alpha=alpha, - hparams=hparams, ) logger.info("Exporting model...") diff --git a/docs/android.md b/docs/android.md index 47530c6c1..cec4358d9 100644 --- a/docs/android.md +++ b/docs/android.md @@ -2,82 +2,55 @@ # Android ## Build on Android using Termux - -[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid. - -With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell: - +[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required). ``` -$ apt update && apt upgrade -y -$ apt install git cmake +apt update && apt upgrade -y +apt install git make cmake ``` -Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake. - -Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance: - +It's recommended to move your model inside the `~/` directory for best performance: ``` -$ curl -L {model-url} -o ~/{model}.gguf +cd storage/downloads +mv model.gguf ~/ ``` -Then, if you are not already in the repo directory, `cd` into `llama.cpp` and: +[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`. +## Building the Project using Android NDK +Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. 
+ +Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: ``` -$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" +$ mkdir build-android +$ cd build-android +$ export NDK= +$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. +$ make ``` -Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. +Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). -To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone: +Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: + +(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) +``` +$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ +$cd /data/data/com.termux/files/home/bin +$chmod +x ./* +``` + +Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` + +``` +$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/ +``` + +Now, you can start chatting: +``` +$cd /data/data/com.termux/files/home/bin +$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml +``` + +Here's a demo of an interactive session running on Pixel 5 phone: https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 - -## Cross-compile using Android NDK -It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.) 
- -Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory: - -``` -$ cmake \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-28 \ - -DCMAKE_C_FLAGS="-march=armv8.7a" \ - -DCMAKE_CXX_FLAGS="-march=armv8.7a" \ - -DGGML_OPENMP=OFF \ - -DGGML_LLAMAFILE=OFF \ - -B build-android -``` - -Notes: - - While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time - - `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325) - -The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use. - -Feel free to adjust the Android ABI for your target. Once the project is configured: - -``` -$ cmake --build build-android --config Release -j{n} -$ cmake --install build-android --prefix {install-dir} --config Release -``` - -After installing, go ahead and download the model of your choice to your host system. Then: - -``` -$ adb shell "mkdir /data/local/tmp/llama.cpp" -$ adb push {install-dir} /data/local/tmp/llama.cpp/ -$ adb push {model}.gguf /data/local/tmp/llama.cpp/ -$ adb shell -``` - -In the `adb shell`: - -``` -$ cd /data/local/tmp/llama.cpp -$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}" -``` - -That's it! - -Be aware that Android will not find the library path `lib` on its own, so we must specify `LD_LIBRARY_PATH` in order to run the installed executables. Android does support `RPATH` in later API levels, so this could change in the future. Refer to the previous section for information about `context-size` (very important!) and running other `examples`. diff --git a/docs/backend/BLIS.md b/docs/backend/BLIS.md index 904548577..35d06bd0f 100644 --- a/docs/backend/BLIS.md +++ b/docs/backend/BLIS.md @@ -27,6 +27,13 @@ We recommend using openmp since it's easier to modify the cores being used. ### llama.cpp compilation +Makefile: + +```bash +make GGML_BLIS=1 -j +# make GGML_BLIS=1 llama-benchmark-matmult +``` + CMake: ```bash diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md deleted file mode 100644 index 23f10175a..000000000 --- a/docs/backend/CANN.md +++ /dev/null @@ -1,263 +0,0 @@ -# llama.cpp for CANN - - - [Background](#background) - - [News](#news) - - [OS](#os) - - [Hardware](#hardware) - - [Model Supports](#model-supports) - - [DataType Supports](#datatype-supports) - - [Docker](#docker) - - [Linux](#linux) - - [TODO](#todo) - - -## Background - -**Ascend NPU** is a range of AI processors using Neural Processing Unit. It will efficiently handle matrix-matrix multiplication, dot-product and scalars. - -**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform. 
- -**Llama.cpp + CANN** - -The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly. - -## News - -- 2024.11 - - Support F16 and F32 data type model for Ascend 310P NPU. -- 2024.8 - - Support `Q4_0` and `Q8_0` data type for Ascend NPU. -- 2024.7 - - Create CANN backend for Ascend NPU. - -## OS - -| OS | Status | Verified | -|:-------:|:-------:|:----------------------------------------------:| -| Linux | Support | Ubuntu 22.04, OpenEuler22.03 | - - -## Hardware - -### Ascend NPU - -**Verified devices** - -| Ascend NPU | Status | -|:-----------------------------:|:-------:| -| Atlas 300T A2 | Support | -| Atlas 300I Duo | Support | - -*Notes:* - -- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag. -- If you run successfully with your Ascend NPU device, please help update the upper table. - - -## Model Supports - -| Model Name | FP16 | Q8_0 | Q4_0 | -|:----------------------------|:-----:|:----:|:----:| -| AquilaChat2-7B | √ | √ | √ | -| Baichuan-7b | √ | √ | √ | -| Baichuan2-7B-Chat | √ | √ | √ | -| bitnet_b1_58-large | √ | √ | √ | -| bloom-560m | √ | x | √ | -| bloomz-alpaca-560m | √ | x | √ | -| c4ai-command-r-35B-v01 | x | x | x | -| chatglm3-6B | x | x | x | -| chinese-alpaca-2-1.3b | √ | √ | √ | -| CodeShell-7B | √ | √ | √ | -| deepseek-ai_deepseek-coder-1.3B-base | x | x | x | -| deepseek-ai_DeepSeek-V2-Lite | x | x | x | -| deepseek-coder-6.7B-instruct | x | x | x | -| DeepSeek-V2-Lite-64x1.5B | x | x | x | -| falcon-7b-instruct | √ | √ | √ | -| flan-t5-large | √ | √ | √ | -| gemma-2-9b-it | √ | √ | √ | -| glm-4-9B | x | x | x | -| gpt2 | √ | √ | √ | -| Gpt2-163M | √ | √ | √ | -| granite-3B-code-instruct | √ | √ | √ | -| GritLM-7B | √ | √ | √ | -| internlm2_5-7b-chat | √ | √ | √ | -| koala-7B-HF | √ | √ | √ | -| Llama-2-7b-chat-hf | √ | √ | √ | -| Llama-3-Smaug-8B | √ | √ | √ | -| Llama2-Chinese-7b-Chat | √ | √ | √ | -| Llama3-8B | √ | √ | √ | -| Llama3-8b-chinese | √ | √ | √ | -| mamba-130m-hf | √ | √ | √ | -| Mistral-7B-Instruct-v0.2 | √ | √ | √ | -| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ | -| mpt-7B | √ | √ | √ | -| OLMo-1B-hf | √ | √ | √ | -| OpenELM-3B-Instruct | √ | √ | √ | -| Orion-14b-base | √ | √ | √ | -| phi1 | x | x | x | -| phi2 | x | x | x | -| Phi-3-mini-4k-instruct | √ | √ | √ | -| plamo-13b | √ | √ | √ | -| pythia-70M | x | x | x | -| Qwen-7B | √ | √ | √ | -| Qwen2-1.5B-Instruct | √ | x | √ | -| Refact-1_6B-fim | √ | √ | √ | -| SmolLM-135M | √ | √ | √ | -| stablelm-zephyr | x | x | x | -| stablelm-2-zephyr-1_6b | x | x | x | -| starcoderbase-1b | √ | √ | √ | -| starcoder2-3b | √ | √ | √ | -| vigogne-7b-chat | √ | √ | √ | -| xverse-7b-chat | √ | √ | √ | -| Yi-6b-Chat | √ | √ | √ | - - - -## DataType Supports - -| DataType | Status | -|:----------------------:|:-------:| -| FP16 | Support | -| Q8_0 | Support | -| Q4_0 | Support | - -## Docker - -### Build Images -You can get a image with llama.cpp in one command. -```sh -docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile . -``` - -### Run container - -```sh -# Find all cards. -npu-smi info - -# Select the cards that you want to use, make sure these cards are not used by someone. -# Following using cards of device0. 
-docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:" -``` - -*Notes:* - -- You may need to install Ascend Driver and firmware on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*. - -## Linux - -### I. Setup Environment - -1. **Install Ascend Driver and firmware** - - ```sh - # create driver running user. - sudo groupadd -g HwHiAiUser - sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash - sudo usermod -aG HwHiAiUser $USER - - # download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system - # and install driver. - sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all - ``` - - Once installed, run `npu-smi info` to check whether driver is installed successfully. - ```sh - +-------------------------------------------------------------------------------------------+ - | npu-smi 24.1.rc2 Version: 24.1.rc2 | - +----------------------+---------------+----------------------------------------------------+ - | NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)| - | Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) | - +======================+===============+====================================================+ - | 2 xxx | OK | 64.4 51 15 / 15 | - | 0 | 0000:01:00.0 | 0 1873 / 15077 0 / 32768 | - +======================+===============+====================================================+ - | 5 xxx | OK | 64.0 52 15 / 15 | - | 0 | 0000:81:00.0 | 0 1874 / 15077 0 / 32768 | - +======================+===============+====================================================+ - | No running processes found in NPU 2 | - +======================+===============+====================================================+ - | No running processes found in NPU 5 | - +======================+===============+====================================================+ - ``` - -2. **Install Ascend Firmware** - ```sh - # download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system - # and install driver. - sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full - ``` - If the following messaage appers, firmware is installed successfully. - ```sh - Firmware package installed successfully! - ``` - - -3. **Install CANN toolkit and kernels** - - CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page. - - Please download the corresponding version that satified your system. The minimum version required is 8.0.RC2.alpha002 and here is the install command. 
- ```sh - pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions - sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install - sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install - ``` - - Set Ascend Variables: - ```sh - echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc - source ~/.bashrc - ``` - -Upon a successful installation, CANN is enabled for the available ascend devices. - -### II. Build llama.cpp - -```sh -cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release -cmake --build build --config release -``` - -### III. Run the inference - -1. **Retrieve and prepare model** - - You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model prepration. - - **Notes**: - - - CANN backend only supports FP16/Q4_0/Q8_0 models currently. - -2. **Launch inference** - - There are two device selection modes: - - - Single device: Use one device target specified by the user. - - Multiple devices: Automatically choose the devices with the same backend. - - | Device selection | Parameter | - |:----------------:|:--------------------------------------:| - | Single device | --split-mode none --main-gpu DEVICE_ID | - | Multiple devices | --split-mode layer (default) | - - Examples: - - - Use device 0: - - ```sh - ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 - ``` - - - Use multiple devices: - - ```sh - ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer - ``` - -### **GitHub contribution**: -Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay. - - -## TODO -- Support more models and data types. diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 89ddbd669..885983e92 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -20,13 +20,17 @@ **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include: - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers. -- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*. +- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*. - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs. - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets. ### Llama.cpp + SYCL -The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD. +The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*). + +When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend. + +It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. 
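+For the CPU-only oneMKL path mentioned above, a minimal build sketch looks like the following. This is only a sketch: it assumes the default oneAPI environment script location and that the CMake options below match the Intel oneMKL section of the build guide; adjust paths and options for your setup.
+
+```sh
+# Set up the oneAPI environment (icx/icpx compilers and oneMKL).
+source /opt/intel/oneapi/setvars.sh
+
+# Configure a CPU build that uses oneMKL as the BLAS backend.
+cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
+
+# Build all binaries.
+cmake --build build --config Release
+```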
In the initial work, oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

## Recommended Release

@@ -34,20 +38,13 @@ The SYCL backend may be broken by some PRs because there is no online CI.

The following release is verified with good quality:

-|Commit ID|Tag|Release|Verified Platform| Update date|
-|-|-|-|-|-|
-|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
+|Commit ID|Tag|Release|Verified Platform|
+|-|-|-|-|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| ## News -- 2024.11 - - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer. - -- 2024.8 - - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs. - - 2024.5 - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770. - Arch Linux is verified successfully. @@ -83,14 +80,7 @@ The following release is verified with good quality: ### Intel GPU -SYCL backend supports Intel GPU Family: - -- Intel Data Center Max Series -- Intel Flex Series, Arc Series -- Intel Built-in Arc GPU -- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)). - -#### Verified devices +**Verified devices** | Intel GPU | Status | Verified Model | |-------------------------------|---------|---------------------------------------| @@ -98,7 +88,7 @@ SYCL backend supports Intel GPU Family: | Intel Data Center Flex Series | Support | Flex 170 | | Intel Arc Series | Support | Arc 770, 730M, Arc A750 | | Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake | -| Intel iGPU | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 | +| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 | *Notes:* @@ -114,18 +104,10 @@ SYCL backend supports Intel GPU Family: **Verified devices** -| Nvidia GPU | Status | Verified Model | -|--------------------------|-----------|----------------| -| Ampere Series | Supported | A100, A4000 | -| Ampere Series *(Mobile)* | Supported | RTX 40 Series | - -| AMD GPU | Status | Verified Model | -|--------------------------|--------------|----------------| -| Radeon Pro | Experimental | W6800 | -| Radeon RX | Experimental | 6700 XT | - -Note: AMD GPU support is highly experimental and is incompatible with F16. -Additionally, it only supports GPUs with a sub_group_size (warp size) of 32. +| Nvidia GPU | Status | Verified Model | +|--------------------------|---------|----------------| +| Ampere Series | Support | A100, A4000 | +| Ampere Series *(Mobile)* | Support | RTX 40 Series | ## Docker The docker build option is currently limited to *intel GPU* targets. @@ -133,7 +115,7 @@ The docker build option is currently limited to *intel GPU* targets. ### Build image ```sh # Using FP16 -docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile . +docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile . ``` *Notes*: @@ -197,10 +179,6 @@ Platform #0: Intel(R) OpenCL HD Graphics In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed. -- **AMD GPU** - -To target AMD GPUs with SYCL, the ROCm stack must be installed first. - 2. **Install Intel® oneAPI Base toolkit** - **For Intel GPU** @@ -211,7 +189,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable. -Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs. 
+Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs. - **Adding support to Nvidia GPUs** @@ -227,19 +205,6 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB cmake --build buildWithCublas --config Release ``` -- **Adding support to AMD GPUs** - -**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit. - -**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs. - -```sh -git clone https://github.com/oneapi-src/oneMKL -cd oneMKL -# Find your HIPTARGET with rocminfo, under the key 'Name:' -cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas -cmake --build buildWithrocBLAS --config Release -``` 3. **Verify installation and environment** @@ -251,48 +216,33 @@ sycl-ls - **Intel GPU** -When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below: +When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. 
Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below: ``` -[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] -[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] -[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50] -[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918] +[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] +[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] +[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50] +[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918] ``` - **Nvidia GPU** -Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below: - +Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow: ``` -[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix] -[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix] -[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5] -``` - -- **AMD GPU** - -For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]: - -``` -[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000] -[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9] +[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix] +[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix] +[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2] ``` ### II. 
Build llama.cpp #### Intel GPU - -``` -./examples/sycl/build.sh -``` - -or - ```sh # Export relevant ENV variables source /opt/intel/oneapi/setvars.sh +# Build LLAMA with MKL BLAS acceleration for intel GPU + # Option 1: Use FP32 (recommended for better performance in most cases) cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx @@ -304,7 +254,6 @@ cmake --build build --config Release -j -v ``` #### Nvidia GPU - ```sh # Export relevant ENV variables export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH @@ -313,106 +262,62 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR # Build LLAMA with Nvidia BLAS acceleration through SYCL -# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance -GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # Option 2: Use FP16 -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # build all binary cmake --build build --config Release -j -v -``` -#### AMD GPU - -```sh -# Export relevant ENV variables -export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH -export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH -export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR - -# Build LLAMA with rocBLAS acceleration through SYCL - -## AMD -# Use FP32, FP16 is not supported -# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:' -GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx - -# build all binary -cmake --build build --config Release -j -v ``` ### III. Run the inference -#### Retrieve and prepare model +1. Retrieve and prepare model You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. -##### Check device - -1. Enable oneAPI running environment +2. Enable oneAPI running environment ```sh source /opt/intel/oneapi/setvars.sh ``` -2. List devices information +3. List devices information Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ```sh ./build/bin/llama-ls-sycl-device ``` - -This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. 
For example, in a system with 2 *intel GPU* it would look like the following: +A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: ``` -found 2 SYCL devices: - +found 6 SYCL devices: | | | |Compute |Max compute|Max work|Max sub| | |ID| Device Type| Name|capability|units |group |group |Global mem size| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| +| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| +| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| +| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| +| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -#### Choose level-zero devices +| Attribute | Note | +|------------------------|-------------------------------------------------------------| +| compute capability 1.3 | Level-zero driver/runtime, recommended | +| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases | -|Chosen Device ID|Setting| -|-|-| -|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action| -|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`| -|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`| - -#### Execute - -Choose one of following methods to run. - -1. Script - -- Use device 0: - -```sh -./examples/sycl/run-llama2.sh 0 -``` -- Use multiple devices: - -```sh -./examples/sycl/run-llama2.sh -``` - -2. Command line -Launch inference +4. Launch inference There are two device selection modes: -- Single device: Use one device assigned by user. Default device id is 0. -- Multiple devices: Automatically choose the devices with the same backend. - -In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. +- Single device: Use one device target specified by the user. +- Multiple devices: Automatically select the devices with the same largest Max compute-units. | Device selection | Parameter | |------------------|----------------------------------------| @@ -426,6 +331,11 @@ Examples: ```sh ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ``` +or run by script: + +```sh +./examples/sycl/run_llama2.sh 0 +``` - Use multiple devices: @@ -433,6 +343,12 @@ ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Bui ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ``` +Otherwise, you can run the script: + +```sh +./examples/sycl/run_llama2.sh +``` + *Notes:* - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow: @@ -479,7 +395,7 @@ c. Verify installation In the oneAPI command line, run the following to print the available SYCL devices: ``` -sycl-ls.exe +sycl-ls ``` There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device: @@ -500,18 +416,6 @@ b. 
The new Visual Studio will install Ninja as default. (If not, please install ### II. Build llama.cpp -You could download the release package for Windows directly, which including binary files and depended oneAPI dll files. - -Choose one of following methods to build from source code. - -1. Script - -```sh -.\examples\sycl\win-build-sycl.bat -``` - -2. CMake - On the oneAPI command line window, step into the llama.cpp main directory and run the following: ``` @@ -526,8 +430,12 @@ cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPI cmake --build build --config Release -j ``` -Or, use CMake presets to build: +Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions: +```sh +.\examples\sycl\win-build-sycl.bat +``` +Or, use CMake presets to build: ```sh cmake --preset x64-windows-sycl-release cmake --build build-x64-windows-sycl-release -j --target llama-cli @@ -539,9 +447,7 @@ cmake --preset x64-windows-sycl-debug cmake --build build-x64-windows-sycl-debug -j --target llama-cli ``` -3. Visual Studio - -You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. +Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. *Notes:* @@ -549,65 +455,52 @@ You can use Visual Studio to open llama.cpp folder as a CMake project. Choose th ### III. Run the inference -#### Retrieve and prepare model +1. Retrieve and prepare model -You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. +You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. -##### Check device - -1. Enable oneAPI running environment +2. Enable oneAPI running environment On the oneAPI command line window, run the following and step into the llama.cpp directory: ``` "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 ``` -2. List devices information +3. List devices information Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ``` -build\bin\llama-ls-sycl-device.exe +build\bin\ls-sycl-device.exe ``` -This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. 
For example, in a system with 2 *intel GPU* it would look like the following: +The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following: ``` -found 2 SYCL devices: +found 6 SYCL devices: | | | |Compute |Max compute|Max work|Max sub| | |ID| Device Type| Name|capability|units |group |group |Global mem size| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| +| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| +| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| +| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| +| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -#### Choose level-zero devices -|Chosen Device ID|Setting| -|-|-| -|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action| -|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`| -|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`| +| Attribute | Note | +|------------------------|-----------------------------------------------------------| +| compute capability 1.3 | Level-zero running time, recommended | +| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases | -#### Execute -Choose one of following methods to run. - -1. Script - -``` -examples\sycl\win-run-llama2.bat -``` - -2. Command line - -Launch inference +4. Launch inference There are two device selection modes: -- Single device: Use one device assigned by user. Default device id is 0. -- Multiple devices: Automatically choose the devices with the same backend. - -In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. +- Single device: Use one device assigned by user. +- Multiple devices: Automatically choose the devices with the same biggest Max compute units. | Device selection | Parameter | |------------------|----------------------------------------| @@ -627,7 +520,11 @@ build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website ca ``` build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer ``` +Otherwise, run the following wrapper script: +``` +.\examples\sycl\win-run-llama2.bat +``` Note: @@ -641,19 +538,17 @@ Or use 1 SYCL GPUs: [0] with Max compute units:512 ``` - ## Environment Variable #### Build -| Name | Value | Function | -|--------------------|---------------------------------------|---------------------------------------------| -| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better perforemance than FP16 on quantized model| -| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | -| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | -| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | -| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | -| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | +| Name | Value | Function | +|--------------------|-----------------------------------|---------------------------------------------| +| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. | +| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | +| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. | +| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | #### Runtime @@ -689,26 +584,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512 ``` Otherwise, please double-check the GPU driver installation steps. -- Can I report Ollama issue on Intel GPU to llama.cpp SYCL backend? - - No. We can't support Ollama issue directly, because we aren't familiar with Ollama. - - Sugguest reproducing on llama.cpp and report similar issue to llama.cpp. We will surpport it. - - It's same for other projects including llama.cpp SYCL backend. - -- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer` - - Device Memory is not enough. - - |Reason|Solution| - |-|-| - |Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.| - |Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;
Use more than one devices to load model.| - ### **GitHub contribution**: Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay. ## TODO -- NA +- Support row layer split for multiple card runs. diff --git a/docs/build.md b/docs/build.md index afb7a0402..916fcf22d 100644 --- a/docs/build.md +++ b/docs/build.md @@ -7,75 +7,113 @@ git clone https://github.com/ggerganov/llama.cpp cd llama.cpp ``` -The following sections describe how to build with different backends and options. +In order to build llama.cpp you have four different options. -## CPU Build +- Using `make`: + - On Linux or MacOS: -Build llama.cpp using `CMake`: + ```bash + make + ``` -```bash -cmake -B build -cmake --build build --config Release -``` + - On Windows: -**Notes**: + 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). + 2. Extract `w64devkit` on your pc. + 3. Run `w64devkit.exe`. + 4. Use the `cd` command to reach the `llama.cpp` folder. + 5. From here you can run: + ```bash + make + ``` -- For faster compilation, add the `-j` argument to run multiple jobs in parallel, or use a generator that does this automatically such as Ninja. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. -- For faster repeated compilation, install [ccache](https://ccache.dev/) -- For debug builds, there are two cases: + - Notes: + - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`. + - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel. + - For faster repeated compilation, install [ccache](https://ccache.dev/). + - For debug builds, run `make LLAMA_DEBUG=1` - 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag): +- Using `CMake`: - ```bash - cmake -B build -DCMAKE_BUILD_TYPE=Debug - cmake --build build - ``` - - 2. Multi-config generators (`-G` param set to Visual Studio, XCode...): - - ```bash - cmake -B build -G "Xcode" - cmake --build build --config Debug - ``` - - For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html). -- For static builds, add `-DBUILD_SHARED_LIBS=OFF`: - ``` - cmake -B build -DBUILD_SHARED_LIBS=OFF + ```bash + cmake -B build cmake --build build --config Release ``` -- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: - - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): - - Tab Workload: Desktop-development with C++ - - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang) - - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test - - For Windows on ARM (arm64, WoA) build with: - ```bash - cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF - cmake --build build-arm64-windows-llvm-release - ``` - Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. 
However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels. + **Notes**: + + - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. + - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. + - For faster repeated compilation, install [ccache](https://ccache.dev/). + - For debug builds, there are two cases: + + 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag): - For building with ninja generator and clang compiler as default: - -set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64 ```bash - cmake --preset x64-windows-llvm-release - cmake --build build-x64-windows-llvm-release + cmake -B build -DCMAKE_BUILD_TYPE=Debug + cmake --build build ``` + 2. Multi-config generators (`-G` param set to Visual Studio, XCode...): + + ```bash + cmake -B build -G "Xcode" + cmake --build build --config Debug + ``` + +- Using `gmake` (FreeBSD): + + 1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics) + 2. Add your user to **video** group + 3. Install compilation dependencies. + + ```bash + sudo pkg install gmake automake autoconf pkgconf llvm15 openblas + + gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4 + ``` + +## Metal Build + +On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU. +To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option. + +When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line +argument. + ## BLAS Build -Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use: +Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use: -### Accelerate Framework +### Accelerate Framework: This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions. -### OpenBLAS +### OpenBLAS: This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine. +- Using `make`: + - On Linux: + ```bash + make GGML_OPENBLAS=1 + ``` + + - On Windows: + + 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). + 2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases). + 3. Extract `w64devkit` on your pc. + 4. 
From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`. + 5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`. + 6. Run `w64devkit.exe`. + 7. Use the `cd` command to reach the `llama.cpp` folder. + 8. From here you can run: + + ```bash + make GGML_OPENBLAS=1 + ``` + - Using `CMake` on Linux: ```bash @@ -87,6 +125,14 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i Check [BLIS.md](./backend/BLIS.md) for more information. +### SYCL + +SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators. + +llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). + +For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md). + ### Intel oneMKL Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md). @@ -104,124 +150,51 @@ Building through oneAPI compilers will make avx_vnni instruction set available f Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. -### Other BLAS libraries +### CUDA -Any other BLAS library can be used by setting the `GGML_BLAS_VENDOR` option. See the [CMake documentation](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) for a list of supported vendors. +This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). -## Metal Build - -On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU. -To disable the Metal build at compile time use the `-DGGML_METAL=OFF` cmake option. - -When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers 0` command-line argument. - -## SYCL - -SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators. - -llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). - -For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md). - -## CUDA - -This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed. - -#### Download directly from NVIDIA -You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads). - - -#### Compile and run inside a Fedora Toolbox Container -We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/). - -**Recommended for:** - -- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/). 
-- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde). -- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download) - - -### Compilation -```bash -cmake -B build -DGGML_CUDA=ON -cmake --build build --config Release -``` - -### Override Compute Capability Specifications - -If `nvcc` cannot detect your gpu, you may get compile-warnings such as: - ```text -nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used -``` - -To override the `native` GPU detection: - -#### 1. Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute > Capability"](https://developer.nvidia.com/cuda-gpus). - -```text -GeForce RTX 4090 8.9 -GeForce RTX 3080 Ti 8.6 -GeForce RTX 3070 8.6 -``` - -#### 2. Manually list each varying `Compute Capability` in the `CMAKE_CUDA_ARCHITECTURES` list. - -```bash -cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89" -``` - -### Runtime CUDA environmental variables - -You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime. - -```bash -# Use `CUDA_VISIBLE_DEVICES` to hide the first compute device. -CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf -``` - -### Unified Memory - -The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`. - -### Performance Tuning - -The following compilation options are also available to tweak performance: - -| Option | Legal values | Default | Description | -|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. | -| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models | -| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | -| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. 
| -| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | - -## MUSA - -This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa). +For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling. +- Using `make`: + ```bash + make GGML_CUDA=1 + ``` - Using `CMake`: ```bash - cmake -B build -DGGML_MUSA=ON + cmake -B build -DGGML_CUDA=ON cmake --build build --config Release ``` -The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used. +The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: -The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. +| Option | Legal values | Default | Description | +|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | +| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | +| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | +| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. | +| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models | +| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. 
| +| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | +| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | +| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | -Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet. +### hipBLAS -## HIP - -This provides GPU acceleration on HIP-supported AMD GPUs. +This provides BLAS acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). +- Using `make`: + ```bash + make GGML_HIPBLAS=1 + ``` - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build --config Release -- -j 16 ``` On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`. @@ -238,14 +211,19 @@ You can download it from your Linux distro's package manager or from here: [ROCm ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \ HIP_DEVICE_LIB_PATH= \ - cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build -- -j 16 ``` +- Using `make` (example for target gfx1030, build with 16 CPU threads): + ```bash + make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 + ``` + - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): ```bash set PATH=%HIP_PATH%\bin;%PATH% - cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release + cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release cmake --build build ``` Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) @@ -254,16 +232,23 @@ You can download it from your Linux distro's package manager or from here: [ROCm The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used. 
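As an illustration of device selection at runtime, a run pinned to the first ROCm device might look like the sketch below (this assumes a CMake build that produced `./build/bin/llama-cli`; the model path is a placeholder):

```bash
# Restrict this run to the first AMD GPU; the model path is hypothetical
HIP_VISIBLE_DEVICES=0 ./build/bin/llama-cli -m /path/to/model.gguf -p "Building a website can be done in 10 simple steps:" -ngl 32
```

Multiple devices can be listed as a comma-separated index list, e.g. `HIP_VISIBLE_DEVICES=0,1`.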
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. +The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): -## Vulkan +| Option | Legal values | Default | Description | +|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | +| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | +| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + +### Vulkan **Windows** -### w64devkit +#### w64devkit -Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases). +Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases). -Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings. +Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required. Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies: ```sh @@ -279,47 +264,18 @@ Libs: -lvulkan-1 EOF ``` +Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`. -Switch into the `llama.cpp` directory and build using CMake. -```sh -cmake -B build -DGGML_VULKAN=ON -cmake --build build --config Release -``` - -### Git Bash MINGW64 - -Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings - -Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++` - -Download and install [`CMake`](https://cmake.org/download/) with the default settings - -Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings. - -Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands - -``` -cmake -B build -DGGML_VULKAN=ON -cmake --build build --config Release -``` - -Now you can load the model in conversation mode using `Vulkan` - -```sh -build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv -``` - -### MSYS2 +#### MSYS2 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies. -```sh -pacman -S git \ - mingw-w64-ucrt-x86_64-gcc \ - mingw-w64-ucrt-x86_64-cmake \ - mingw-w64-ucrt-x86_64-vulkan-devel \ - mingw-w64-ucrt-x86_64-shaderc -``` - -Switch into the `llama.cpp` directory and build using CMake. 
+ ```sh + pacman -S git \ + mingw-w64-ucrt-x86_64-gcc \ + mingw-w64-ucrt-x86_64-cmake \ + mingw-w64-ucrt-x86_64-vulkan-devel \ + mingw-w64-ucrt-x86_64-shaderc + ``` +Switch into `llama.cpp` directory and build using CMake. ```sh cmake -B build -DGGML_VULKAN=ON cmake --build build --config Release @@ -331,7 +287,7 @@ You don't need to install Vulkan SDK. It will be installed inside the container. ```sh # Build the image -docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile . +docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile . # Then, use it: docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 @@ -368,41 +324,6 @@ cmake --build build --config Release # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 ``` -## CANN -This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU. - -For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/). - -Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann) - -Go to `llama.cpp` directory and build using CMake. -```bash -cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release -cmake --build build --config release -``` - -You can test with: - -```bash -./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32 -``` - -If the following info is output on screen, you are using `llama.cpp` with the CANN backend: -```bash -llm_load_tensors: CANN model buffer size = 13313.00 MiB -llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB -``` - -For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md). - -## Android +### Android To read documentation for how to build on Android, [click here](./android.md) - -## Notes about GPU-accelerated backends - -The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`. - -In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. To see a list of available devices, use the `--list-devices` option. - -Backends can be built as dynamic libraries that can be loaded dynamically at runtime. This allows you to use the same llama.cpp binary on different machines with different GPUs. To enable this feature, use the `GGML_BACKEND_DL` option when building. diff --git a/docs/cuda-fedora.md b/docs/cuda-fedora.md deleted file mode 100644 index 9c88b7694..000000000 --- a/docs/cuda-fedora.md +++ /dev/null @@ -1,270 +0,0 @@ -# Setting Up CUDA on Fedora - -In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox container. 
This guide is applicable for: - -- [Fedora Workstation](https://fedoraproject.org/workstation/) -- [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/) -- [Fedora Spins](https://fedoraproject.org/spins) -- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8.5`, `Arch Linux`, and `Ubuntu`. - -## Table of Contents - -- [Prerequisites](#prerequisites) -- [Using the Fedora 41 CUDA Repository](#using-the-fedora-41-cuda-repository) -- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment) -- [Installing Essential Development Tools](#installing-essential-development-tools) -- [Adding the CUDA Repository](#adding-the-cuda-repository) -- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs) -- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts) -- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs) -- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package) -- [Configuring the Environment](#configuring-the-environment) -- [Verifying the Installation](#verifying-the-installation) -- [Conclusion](#conclusion) -- [Troubleshooting](#troubleshooting) -- [Additional Notes](#additional-notes) -- [References](#references) - -## Prerequisites - -- **Toolbox Installed on the Host System** `Fedora Silverblue` and `Fedora Workstation` both have toolbox by default, other distributions may need to install the [toolbox package](https://containertoolbx.org/install/). -- **NVIDIA Drivers and Graphics Card installed on Host System (recommended)** To run CUDA program, such as `llama.cpp`, the host should be setup to access your NVIDIA hardware. Fedora Hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA). -- **Internet connectivity** to download packages. - -### Using the Fedora 41 CUDA Repository - -The latest release is 41. - -- [Fedora 41 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/) - -**Note:** We recommend using a toolbox environment to prevent system conflicts. - -## Creating a Fedora Toolbox Environment - -This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using the Fedora Toolbox allows us to install the necessary packages without affecting the host system. - -**Note:** Toolbox is available for other systems, and even without Toolbox, it is possible to use Podman or Docker. - -1. **Create a Fedora 41 Toolbox:** - - ```bash - toolbox create --image registry.fedoraproject.org/fedora-toolbox:41 --container fedora-toolbox-41-cuda - ``` - -2. **Enter the Toolbox:** - - ```bash - toolbox enter --container fedora-toolbox-41-cuda - ``` - - Inside the toolbox, you have root privileges and can install packages without affecting the host system. - -## Installing Essential Development Tools - -1. **Synchronize the DNF Package Manager:** - - ```bash - sudo dnf distro-sync - ``` - -2. **Install the Default Text Editor (Optional):** - - ```bash - sudo dnf install vim-default-editor --allowerasing - ``` - - The `--allowerasing` flag will allow the removal of the conflicting `nano-default-editor` package. - -3. **Install Development Tools and Libraries:** - - ```bash - sudo dnf install @c-development @development-tools cmake - ``` - - This installs essential packages for compiling software, including `gcc`, `make`, and other development headers. 
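As an optional sanity check before moving on, you can confirm that the toolchain installed above is actually visible inside the toolbox:

```bash
# Quick check that the compilers and CMake installed above are on PATH inside the toolbox
gcc --version
make --version
cmake --version
```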
- -## Adding the CUDA Repository - -Add the NVIDIA CUDA repository to your DNF configuration: - -```bash -sudo dnf config-manager addrepo --from-repofile=https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/cuda-fedora41.repo -``` - -After adding the repository, synchronize the package manager again: - -```bash -sudo dnf distro-sync -``` - -## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs` - -We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go). - -```bash -ls -la /usr/lib64/libcuda.so.1 -``` - -**Explanation:** - -- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA, - on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries. - -### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found). - -```bash -sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs -``` - -### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found). - -If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files. - -#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies) - -```bash -sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs -``` - -#### 2. Update the RPM database to assume the installation of these packages. - -```bash -sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/* -``` - -**Note:** - -- The `--justdb` option only updates the RPM database, without touching the filesystem. - -#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs` - -After manually installing the dependencies, run: - -```bash -sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs -``` - -You should receive a message indicating the package is already installed: - -``` -Updating and loading repositories: -Repositories loaded. -Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed. -Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed. - -Nothing to do. -``` - -## Installing the CUDA Meta-Package - -Now that the driver libraries are installed, proceed to install CUDA: - -```bash -sudo dnf install cuda -``` - -This installs the CUDA toolkit and associated packages. - -## Configuring the Environment - -To use CUDA, add its binary directory to your system's `PATH`. - -1. **Create a Profile Script:** - - ```bash - sudo sh -c 'echo "export PATH=\$PATH:/usr/local/cuda/bin" >> /etc/profile.d/cuda.sh' - ``` - - **Explanation:** - - - We add to `/etc/profile.d/` as the `/etc/` folder is unique to this particular container, and is not shared with other containers or the host system. - - The backslash `\` before `$PATH` ensures the variable is correctly written into the script. - -2. **Make the Script Executable:** - - ```bash - sudo chmod +x /etc/profile.d/cuda.sh - ``` - -3. **Source the Script to Update Your Environment:** - - ```bash - source /etc/profile.d/cuda.sh - ``` - - **Note:** This command updates your current shell session with the new `PATH`. The `/etc/profile.d/cuda.sh` script ensures that the CUDA binaries are available in your `PATH` for all future sessions. 
- -## Verifying the Installation - -To confirm that CUDA is correctly installed and configured, check the version of the NVIDIA CUDA Compiler (`nvcc`): - -```bash -nvcc --version -``` - -You should see output similar to: - -``` -nvcc: NVIDIA (R) Cuda compiler driver -Copyright (c) 2005-2025 NVIDIA Corporation -Built on Wed_Jan_15_19:20:09_PST_2025 -Cuda compilation tools, release 12.8, V12.8.61 -Build cuda_12.8.r12.8/compiler.35404655_0 -``` - -This output confirms that the CUDA compiler is accessible and indicates the installed version. - -## Conclusion - -You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 41 CUDA repository. By manually updating the RPM db and configuring the environment, you can develop CUDA applications without affecting your host system. - -## Troubleshooting - -- **Installation Failures:** - - - If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies. - - You may use the `--excludepath` option with `rpm` to exclude conflicting files during manual RPM installations. - -- **Rebooting the Container:** - - - Sometimes there may be a bug in the NVIDIA driver host passthrough (such as missing a shared library). Rebooting the container may solve this issue: - - ```bash - # on the host system - podman container restart --all - ``` - -- **Environment Variables Not Set:** - - If `nvcc` is not found after installation, ensure that `/usr/local/cuda/bin` is in your `PATH`. - - Run `echo $PATH` to check if the path is included. - - Re-source the profile script or open a new terminal session. - -## Additional Notes - -- **Updating CUDA in the Future:** - - - Keep an eye on the official NVIDIA repositories for updates to your Fedora version. - - When an updated repository becomes available, adjust your `dnf` configuration accordingly. - -- **Building `llama.cpp`:** - - - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support. - - Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration. - -- **Using the Toolbox Environment:** - - The toolbox environment is isolated from your host system, which helps prevent conflicts. - - Remember that system files and configurations inside the toolbox are separate from the host. By default the home directory of the user is shared between the host and the toolbox. - ---- - -**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with he toolbox. - -**Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide. 
- -## References - -- [Fedora Toolbox Documentation](https://docs.fedoraproject.org/en-US/fedora-silverblue/toolbox/) -- [NVIDIA CUDA Installation Guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) -- [Podman Documentation](https://podman.io/get-started) - ---- diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md index 8fcd70811..04c5ccbbe 100644 --- a/docs/development/HOWTO-add-model.md +++ b/docs/development/HOWTO-add-model.md @@ -28,7 +28,7 @@ The required steps to implement for an HF model are: ```python @Model.register("MyModelForCausalLM") class MyModel(Model): - model_arch = gguf.MODEL_ARCH.MYMODEL + model_arch = gguf.MODEL_ARCH.GROK ``` 2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py) @@ -79,14 +79,14 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi - `Model#set_vocab` - `Model#write_tensors` -NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the convention and several tools like `quantize` expect this to proceed the weights. +NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights. ### 2. Define the model architecture in `llama.cpp` The model params and tensors layout must be defined in `llama.cpp`: 1. Define a new `llm_arch` 2. Define the tensors layout in `LLM_TENSOR_NAMES` -3. Add any non-standard metadata in `llm_load_hparams` +3. Add any non standard metadata in `llm_load_hparams` 4. Create the tensors for inference in `llm_load_tensors` 5. If the model has a RoPE operation, add the rope type in `llama_rope_type` @@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`. -Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`. +Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`. -Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR. +When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/). diff --git a/docs/docker.md b/docs/docker.md index dac9a9ec1..d8922d77d 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -19,11 +19,8 @@ Additionally, there the following images, similar to the above: - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) -The GPU enabled images are not currently tested by CI beyond being built. 
They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now). +The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). ## Usage @@ -60,17 +57,17 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia ## Building Docker locally ```bash -docker build -t local/llama.cpp:full-cuda --target full -f .devops/cuda.Dockerfile . -docker build -t local/llama.cpp:light-cuda --target light -f .devops/cuda.Dockerfile . -docker build -t local/llama.cpp:server-cuda --target server -f .devops/cuda.Dockerfile . +docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . +docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile . +docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile . ``` You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. The defaults are: -- `CUDA_VERSION` set to `12.6.0` -- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures +- `CUDA_VERSION` set to `11.7.1` +- `CUDA_DOCKER_ARCH` set to `all` The resulting images, are essentially the same as the non-CUDA images: @@ -87,37 +84,3 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` - -## Docker With MUSA - -Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/native) properly installed on Linux, `muBLAS` should be accessible inside the container. - -## Building Docker locally - -```bash -docker build -t local/llama.cpp:full-musa --target full -f .devops/musa.Dockerfile . -docker build -t local/llama.cpp:light-musa --target light -f .devops/musa.Dockerfile . -docker build -t local/llama.cpp:server-musa --target server -f .devops/musa.Dockerfile . -``` - -You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture. - -The defaults are: - -- `MUSA_VERSION` set to `rc3.1.0` - -The resulting images, are essentially the same as the non-MUSA images: - -1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. -2. `local/llama.cpp:light-musa`: This image only includes the main executable file. -3. `local/llama.cpp:server-musa`: This image only includes the server executable file. 
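As a sketch of how those `ARGS` can be overridden at build time (the value below simply repeats the documented default; substitute whatever matches your container host and GPU):

```bash
# Build the light MUSA image while overriding the MUSA_VERSION build argument
docker build -t local/llama.cpp:light-musa --target light \
  --build-arg MUSA_VERSION=rc3.1.0 \
  -f .devops/musa.Dockerfile .
```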
- -## Usage - -After building locally, Usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag. - -```bash -docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 -``` diff --git a/docs/llguidance.md b/docs/llguidance.md deleted file mode 100644 index 792d20704..000000000 --- a/docs/llguidance.md +++ /dev/null @@ -1,51 +0,0 @@ -# LLGuidance Support in llama.cpp - -[LLGuidance](https://github.com/guidance-ai/llguidance) is a library for constrained decoding (also called constrained sampling or structured outputs) for Large Language Models (LLMs). Initially developed as the backend for the [Guidance](https://github.com/guidance-ai/guidance) library, it can also be used independently. - -LLGuidance supports JSON Schemas and arbitrary context-free grammars (CFGs) written in a [variant](https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md) of Lark syntax. It is [very fast](https://github.com/guidance-ai/jsonschemabench/tree/main/maskbench) and has [excellent](https://github.com/guidance-ai/llguidance/blob/main/docs/json_schema.md) JSON Schema coverage but requires the Rust compiler, which complicates the llama.cpp build process. - -## Building - -To enable LLGuidance support, build llama.cpp with the `LLAMA_LLGUIDANCE` option: - -```sh -cmake -B build -DLLAMA_LLGUIDANCE=ON -make -C build -j -``` - -This requires the Rust compiler and the `cargo` tool to be [installed](https://www.rust-lang.org/tools/install). - -## Interface - -There are no new command-line arguments or modifications to `common_params`. When enabled, grammars starting with `%llguidance` are passed to LLGuidance instead of the [current](../grammars/README.md) llama.cpp grammars. Additionally, JSON Schema requests (e.g., using the `-j` argument in `llama-cli`) are also passed to LLGuidance. - -For your existing GBNF grammars, you can use [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/scripts/gbnf_to_lark.py) to convert them to LLGuidance Lark-like format. - -## Performance - -Computing a "token mask" (i.e., the set of allowed tokens) for a llama3 tokenizer with 128k tokens takes, on average, 50μs of single-core CPU time for the [JSON Schema Bench](https://github.com/guidance-ai/jsonschemabench). The p99 time is 0.5ms, and the p100 time is 20ms. These results are due to the lexer/parser split and several [optimizations](https://github.com/guidance-ai/llguidance/blob/main/docs/optimizations.md). - -## JSON Schema - -LLGuidance adheres closely to the JSON Schema specification. For example: - -- `additionalProperties` defaults to `true`, unlike current grammars, though you can set `"additionalProperties": false` if needed. -- any whitespace is allowed. 
-- The definition order in the `"properties": {}` object is maintained, regardless of whether properties are required (current grammars always puts required properties first). - -Unsupported schemas result in an error message—no keywords are silently ignored. - -## Why Not Reuse GBNF Format? - -GBNF lacks the concept of a lexer. - -Most programming languages, including JSON, use a two-step process: a lexer (built with regular expressions) converts a byte stream into lexemes, which are then processed by a CFG parser. This approach is faster because lexers are cheaper to evaluate, and there is ~10x fewer lexemes than bytes. -LLM tokens often align with lexemes, so the parser is engaged in under 0.5% of tokens, with the lexer handling the rest. - -However, the user has to provide the distinction between lexemes and CFG symbols. In [Lark](https://github.com/lark-parser/lark), lexeme names are uppercase, while CFG symbols are lowercase. -The [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/scripts/gbnf_to_lark.py) can often take care of this automatically. -See [LLGuidance syntax docs](https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#terminals-vs-rules) for more details. - -## Error Handling - -Errors are currently printed to `stderr`, and generation continues. Improved error handling may be added in the future. diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 66cfab2c3..155743639 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -6,26 +6,23 @@ find_package(Threads REQUIRED) # ... -# flags - -llama_add_compile_flags() - # examples include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() + add_subdirectory(cvector-generator) + add_subdirectory(baby-llama) add_subdirectory(batched-bench) add_subdirectory(batched) + add_subdirectory(benchmark) + add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) - - if (NOT WIN32) - # disabled on Windows because it uses internal functions not exported with LLAMA_API - add_subdirectory(gbnf-validator) - endif() - + add_subdirectory(export-lora) + add_subdirectory(finetune) + add_subdirectory(gbnf-validator) add_subdirectory(gguf-hash) add_subdirectory(gguf-split) add_subdirectory(gguf) @@ -33,41 +30,28 @@ else() add_subdirectory(imatrix) add_subdirectory(infill) add_subdirectory(llama-bench) + add_subdirectory(llava) add_subdirectory(lookahead) add_subdirectory(lookup) add_subdirectory(main) add_subdirectory(parallel) add_subdirectory(passkey) add_subdirectory(perplexity) + add_subdirectory(quantize-stats) add_subdirectory(quantize) add_subdirectory(retrieval) + if (GGML_RPC) + add_subdirectory(rpc) + endif() if (LLAMA_BUILD_SERVER) - add_subdirectory(server) + add_subdirectory(server) + endif() + if (GGML_SYCL) + add_subdirectory(sycl) endif() add_subdirectory(save-load-state) - add_subdirectory(run) add_subdirectory(simple) - add_subdirectory(simple-chat) add_subdirectory(speculative) - add_subdirectory(speculative-simple) add_subdirectory(tokenize) - add_subdirectory(tts) - add_subdirectory(gen-docs) - if (NOT GGML_BACKEND_DL) - # these examples use the backends directly and cannot be built with dynamic loading - add_subdirectory(convert-llama2c-to-ggml) - add_subdirectory(cvector-generator) - add_subdirectory(export-lora) - if (NOT WIN32) - # disabled on Windows because it uses internal functions not exported with LLAMA_API - add_subdirectory(quantize-stats) - endif() - add_subdirectory(llava) - if (GGML_RPC) - 
add_subdirectory(rpc) - endif() - if (GGML_SYCL) - add_subdirectory(sycl) - endif() - endif() + add_subdirectory(train-text-from-scratch) endif() diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt new file mode 100644 index 000000000..71b82105c --- /dev/null +++ b/examples/baby-llama/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-baby-llama) +add_executable(${TARGET} baby-llama.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp new file mode 100644 index 000000000..4f6c3746a --- /dev/null +++ b/examples/baby-llama/baby-llama.cpp @@ -0,0 +1,1640 @@ +#include "ggml.h" +#include "train.h" + +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#ifdef LLAMA_DEFAULT_RMS_EPS +constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; +#else +constexpr float rms_norm_eps = 5e-6f; +#endif + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +static struct ggml_tensor * randomize_tensor( + struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax +) { + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + } + break; + default: + assert(false); + } + + return tensor; +} + +struct llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + bool operator!=(const llama_hparams & other) const { + return memcmp(this, &other, sizeof(llama_hparams)); + } +}; + +static uint32_t get_n_ff(const struct llama_hparams* hparams) { + const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; + return n_ff; +} + +struct llama_hparams_lora { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? 
+ uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + uint32_t n_lora = 64; + + bool operator!=(const llama_hparams_lora & other) const { + return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0; + } +}; + +struct llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct llama_layer_lora { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wqa; + struct ggml_tensor * wqb; + struct ggml_tensor * wka; + struct ggml_tensor * wkb; + struct ggml_tensor * wva; + struct ggml_tensor * wvb; + struct ggml_tensor * woa; + struct ggml_tensor * wob; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + + +struct llama_kv_cache { + struct ggml_context * ctx = NULL; + + struct ggml_tensor * k; + struct ggml_tensor * v; + + // llama_ctx_buffer buf; + + int n; // number of tokens currently in the cache +}; + +struct llama_model { + struct ggml_context * ctx = NULL; + + llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; +}; + +struct llama_model_lora { + struct ggml_context * ctx = NULL; + + llama_hparams_lora hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * outputa; + struct ggml_tensor * outputb; + + std::vector layers; +}; + +static void init_model(struct llama_model * model) { + const auto & hparams = model->hparams; + + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + const uint32_t n_ff = get_n_ff(&hparams); + + struct ggml_context * ctx = model->ctx; + + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab}); + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd}); + model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab}); + + model->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + // std::string layers_i = "layers." 
+ std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd}); + + layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd}); + + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + } +} + + +static void init_model_lora(struct llama_model_lora * model) { + const auto & hparams = model->hparams; + + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_mult = hparams.n_mult; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + const uint32_t n_lora = hparams.n_lora; + + const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult; + + struct ggml_context * ctx = model->ctx; + + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab}); + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd}); + model->outputa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight", {n_embd, n_vocab}); + model->outputb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // ("output.weight", {n_embd, n_vocab}); + + model->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + // std::string layers_i = "layers." 
+ std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd}); + + layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); + layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd}); + + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + } +} + +static void set_param_model(struct llama_model * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->output); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wq); + ggml_set_param(ctx, layer.wk); + ggml_set_param(ctx, layer.wv); + ggml_set_param(ctx, layer.wo); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +static void set_param_model_lora(struct llama_model_lora * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->outputa); + ggml_set_param(ctx, model->outputb); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wqa); + ggml_set_param(ctx, layer.wqb); + ggml_set_param(ctx, layer.wka); + ggml_set_param(ctx, layer.wkb); + ggml_set_param(ctx, layer.wva); + ggml_set_param(ctx, layer.wvb); + ggml_set_param(ctx, layer.woa); + ggml_set_param(ctx, layer.wob); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { + const auto & hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct random_normal_distribution * rnd = 
init_random_normal_distribution(seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings , rnd); + randomize_tensor_normal(model->norm , rnd); + randomize_tensor_normal(model->output , rnd); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + randomize_tensor_normal(layer.attention_norm, rnd); + + randomize_tensor_normal(layer.wq, rnd); + randomize_tensor_normal(layer.wk, rnd); + randomize_tensor_normal(layer.wv, rnd); + randomize_tensor_normal(layer.wo, rnd); + + randomize_tensor_normal(layer.ffn_norm, rnd); + + randomize_tensor_normal(layer.w1, rnd); + randomize_tensor_normal(layer.w2, rnd); + randomize_tensor_normal(layer.w3, rnd); + } + + free_random_normal_distribution(rnd); +} + + +static void randomize_model_lora( + struct llama_model_lora * model, int seed, float mean, float std, float min, float max +) { + const auto & hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings, rnd); + randomize_tensor_normal(model->norm , rnd); + randomize_tensor_normal(model->outputa , rnd); + randomize_tensor_normal(model->outputb , rnd); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + randomize_tensor_normal(layer.attention_norm, rnd); + + randomize_tensor_normal(layer.wqa, rnd); + randomize_tensor_normal(layer.wqb, rnd); + randomize_tensor_normal(layer.wka, rnd); + randomize_tensor_normal(layer.wkb, rnd); + randomize_tensor_normal(layer.wva, rnd); + randomize_tensor_normal(layer.wvb, rnd); + randomize_tensor_normal(layer.woa, rnd); + randomize_tensor_normal(layer.wob, rnd); + + randomize_tensor_normal(layer.ffn_norm, rnd); + + randomize_tensor_normal(layer.w1, rnd); + randomize_tensor_normal(layer.w2, rnd); + randomize_tensor_normal(layer.w3, rnd); + } + + free_random_normal_distribution(rnd); +} + +static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { + const auto & hparams = model->hparams; + + const uint32_t n_ctx = hparams.n_ctx; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx*n_batch; + const int64_t n_elements = n_embd*n_mem; + + // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + + // struct ggml_init_params params; + // params.mem_size = cache.buf.size; + // params.mem_buffer = cache.buf.addr; + // params.no_alloc = false; + if (!cache->ctx) { + struct ggml_init_params params; + params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; + params.mem_buffer = NULL; + params.no_alloc = false; + + cache->ctx = ggml_init(params); + + if (!cache->ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + exit(1); + } + } + + cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); +} + +static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { + const auto & hparams = model->hparams; + + const uint32_t n_ctx = hparams.n_ctx; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx*n_batch; + const int64_t n_elements = n_embd*n_mem; + + // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + + // struct ggml_init_params params; + // 
params.mem_size = cache.buf.size; + // params.mem_buffer = cache.buf.addr; + // params.no_alloc = false; + if (!cache->ctx) { + struct ggml_init_params params; + params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; + params.mem_buffer = NULL; + params.no_alloc = false; + + cache->ctx = ggml_init(params); + + if (!cache->ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + } + + cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + + return true; +} + +static struct ggml_tensor * forward( + struct llama_model * model, + struct llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past +) { + const int N = n_tokens; + + struct llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + + // inpL shape [n_embd,N,1,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Kcur shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [n_embd, N, 1, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // kv_self.v shape [n_embd * n_ctx * n_layer, 1] + // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] + // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! 
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + } + + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Q shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // K shape [n_embd/n_head, n_past + N, n_head, 1] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + // KQ shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // split cached V into n_head heads + //// V shape [n_past + N, n_embd/n_head, n_head, 1] + // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] + struct ggml_tensor * V = + ggml_view_3d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(vc), + n_ctx*ggml_element_size(vc)*n_embd/n_head, + il*n_ctx*ggml_element_size(vc)*n_embd); + + // KQV shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); + + // cur = ffn_norm*cur + // cur shape [n_embd,N,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + } + + // tmp shape [n_ff,N,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + + // SILU activation + // cur shape [n_ff,N,1,1] + cur = ggml_silu(ctx0, cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul(ctx0, cur, tmp); + + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + } + + // cur shape [n_embd,N,1,1] + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + // inpL shape [n_embd,N,1,1] + inpL = cur; + } + 
+ // norm + { + + // inpL shape [n_embd,N,1,1] + inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + + // inpL = norm*inpL + // inpL shape [n_embd,N,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +static struct ggml_tensor * forward_batch( + struct llama_model * model, + struct llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past, + const int n_batch +) { + const int N = n_tokens; + + struct llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + + // inpL shape [n_embd,N*n_batch,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Kcur shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [N, n_embd, n_batch, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wv, + cur), + n_embd, N, n_batch), + 1, 0, 2, 3)); + + assert_shape_3d(Vcur, N, n_embd, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il] + // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, 
kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_2d(ctx0, kc, + ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), + ggml_element_size(kc)*n_embd*n_ctx, + (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); + vc = ggml_set_2d(ctx0, vc, + ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), + ggml_element_size(vc)*n_ctx*n_embd, + ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); + + assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer); + assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer); + } + + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Q shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // K shape [n_embd/n_head, n_past + N, n_head, n_batch] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_4d(ctx0, + ggml_view_3d(ctx0, + kc, + n_embd, + (n_past + N), + n_batch, + n_embd*ggml_element_size(kc), + n_ctx*n_embd*ggml_element_size(kc), + il*n_batch*n_ctx*n_embd*ggml_element_size(kc)), + n_embd/n_head, n_head, n_past + N, n_batch), + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch); + + // K * Q + // KQ shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + assert_shape_4d(KQ, n_past + N, N, n_head, n_batch); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); + assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); + + // split cached V into n_head heads + // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] + struct ggml_tensor * V = + ggml_view_4d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, n_batch, + ggml_element_size(vc)*n_ctx, + ggml_element_size(vc)*n_ctx*n_embd/n_head, + ggml_element_size(vc)*n_ctx*n_embd, + il*n_batch*n_ctx*n_embd*ggml_element_size(vc)); + assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch); + + // KQV shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + // KQV_merged shape + + // cur = 
KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N*n_batch,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // tmp shape [n_ff,N*n_batch,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_add(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + // inpL shape [n_embd,N*n_batch,1,1] + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N*n_batch,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + // inpL shape [n_vocab,N,n_batch,1] + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +static struct ggml_tensor * forward_lora( + struct llama_model_lora * model, + struct llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past +) { + const int N = n_tokens; + + struct llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + + const int n_ctx = hparams.n_ctx; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, 
GGML_TYPE_I32, N); + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + + // inpL shape [n_embd,N,1,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Kcur shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * Qcur = ggml_rope(ctx0, + ggml_reshape_3d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wqa, + ggml_mul_mat(ctx0, + model->layers[il].wqb, + cur)), + n_embd/n_head, n_head, N), + KQ_pos, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, + ggml_reshape_3d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wka, + ggml_mul_mat(ctx0, + model->layers[il].wkb, + cur)), + n_embd/n_head, n_head, N), + KQ_pos, n_rot, 0); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [n_embd, N, 1, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, + ggml_transpose(ctx0, + ggml_reshape_2d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wva, + ggml_mul_mat(ctx0, + model->layers[il].wvb, + cur)), + n_embd, N))); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // kv_self.v shape [n_embd * n_ctx * n_layer, 1] + // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] + // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! 
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + } + + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Q shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // K shape [n_embd/n_head, n_past + N, n_head, 1] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + // KQ shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // split cached V into n_head heads + //// V shape [n_past + N, n_embd/n_head, n_head, 1] + // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] + struct ggml_tensor * V = + ggml_view_3d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(vc), + n_ctx*ggml_element_size(vc)*n_embd/n_head, + il*n_ctx*ggml_element_size(vc)*n_embd); + + // KQV shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].woa, + ggml_mul_mat(ctx0, + model->layers[il].wob, + cur)); + } + + // inpFF shape [n_embd,N,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); + + // cur = ffn_norm*cur + // cur shape [n_embd,N,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + } + + // tmp shape [n_ff,N,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + + // SILU activation + // cur shape [n_ff,N,1,1] + cur = ggml_silu(ctx0, cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul(ctx0, cur, tmp); + + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + } + + // cur shape [n_embd,N,1,1] + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + // inpL shape [n_embd,N,1,1] + 
inpL = cur; + } + + // norm + { + + // inpL shape [n_embd,N,1,1] + inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + + // inpL = norm*inpL + // inpL shape [n_embd,N,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + //embeddings = inpL; + } + + + // lm_head + // inpL shape [n_vocab,N,1,1] + inpL = ggml_mul_mat(ctx0, + model->outputa, + ggml_mul_mat(ctx0, + model->outputb, + inpL)); + + // ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { + assert(ggml_is_matrix(logits)); + assert(ggml_is_matrix(probs)); + assert(ggml_is_vector(best_samples)); + assert(logits->ne[1] == best_samples->ne[0]); + assert(logits->ne[0] == probs->ne[0]); + assert(logits->ne[1] == probs->ne[1]); + for (int i = 0; i < logits->ne[1]; ++i) { + float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]); + ggml_set_i32_1d(best_samples, i, 0); + for (int k = 0; k < logits->ne[0]; ++k) { + float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); + if (logit > max_logit) { + max_logit = logit; + ggml_set_i32_1d(best_samples, i, k); + } + } + float psum = 0; + for (int k = 0; k < logits->ne[0]; ++k) { + float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); + float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit); + psum += p; + ggml_set_f32_1d(probs, i * probs->ne[0] + k, p); + } + for (int k = 0; k < logits->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum); + } + } +} + +static void sample_softmax_batch( + struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, + struct ggml_tensor * best_samples +) { + GGML_ASSERT(ggml_is_matrix(best_samples)); + GGML_ASSERT(ggml_is_3d(logits)); + GGML_ASSERT(ggml_is_3d(probs)); + int n_tokens = best_samples->ne[0]; + int n_batch = best_samples->ne[1]; + int n_vocab = logits->ne[0]; + GGML_ASSERT(n_tokens == logits->ne[1]); + GGML_ASSERT(n_batch == logits->ne[2]); + GGML_ASSERT(n_vocab == probs->ne[0]); + GGML_ASSERT(n_tokens == probs->ne[1]); + GGML_ASSERT(n_batch == probs->ne[2]); + + for (int k = 0; k < n_batch; ++k) { + struct ggml_tensor * best_samples_k = ggml_view_1d(ctx, + best_samples, + best_samples->ne[0], + k*best_samples->nb[1]); + struct ggml_tensor * logits_k = ggml_view_2d(ctx, + logits, + logits->ne[0], + logits->ne[1], + logits->nb[1], + k*logits->nb[2]); + struct ggml_tensor * probs_k = ggml_view_2d(ctx, + probs, + probs->ne[0], + probs->ne[1], + probs->nb[1], + k*probs->nb[2]); + sample_softmax(logits_k, probs_k, best_samples_k); + } +} + +static void print_row(struct ggml_tensor * probs, int i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + printf(" %.2f", p); + } + printf("\n"); +} + +static void print_matrix(struct ggml_tensor * probs) { + assert(ggml_is_matrix(probs)); + for (int i = 0; i < probs->ne[1]; ++i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); + printf(" %.2f", p); + } + printf("\n"); + } +} + +static void print_token(int token, int n_vocab) { + for (int k = 0; k < token; ++k) { + printf(" "); + } + printf("X"); + for (int k = token+1; k < n_vocab; ++k) { + printf(" "); + } + printf("\n"); +} + +static void print_tokens(struct ggml_tensor * tokens, int n_vocab) { + for (int i=0; i<tokens->ne[0]; ++i) 
{ + int token = ggml_get_i32_1d(tokens, i); + print_token(token, n_vocab); + } +} + +static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { + int n_tokens = tokens_input->ne[0]; + int n_vocab = targets->ne[0]; + float randomness = 0.0f; + // ggml_set_zero(targets); + ggml_set_f32(targets, -1.0f); + ggml_set_i32_1d(tokens_input, 0, 0); + for (int i=1; i 1.0f) ? 1.0f : z; // clamp to [0..1] + int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1)); + ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f); + if (ine[0]; + int n_batch = tokens_input->ne[1]; + GGML_ASSERT(n_tokens == targets->ne[1]); + GGML_ASSERT(n_batch == targets->ne[2]); + + for (int k=0; kne[0], + k*tokens_input->nb[1]); + struct ggml_tensor * targets_k = ggml_view_2d(ctx, + targets, + targets->ne[0], + targets->ne[1], + targets->nb[1], + k*targets->nb[2]); + get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k); + } +} + +static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { + int n_tokens = tokens_input->ne[0]; + int n_vocab = targets->ne[0]; + for (int i=0; i work_buffer; + + for (int ex=0; ex "" [extra-main-args] +# + +if [ $# -lt 2 ]; then + echo "Usage: ./base-translate.sh \"\" [extra-main-args]" + exit 1 +fi + +eargs="" +if [ $# -gt 2 ]; then + eargs="${@:3}" +fi + +ftmp="__llama.cpp_example_tmp__.txt" +trap "rm -f $ftmp" EXIT + +echo "Translate from English to French: + +=== + +sea otter, peppermint, plush girafe: + +sea otter => loutre de mer +peppermint => menthe poivrée +plush girafe => girafe peluche + +=== + +violin + +violin => violon + +=== + +phone, computer, mouse, keyboard: + +phone => téléphone +computer => ordinateur +mouse => souris +keyboard => clavier + +=== +" > $ftmp + +echo "$2 +" >> $ftmp + +model=$1 + +# generate the most likely continuation until the string "===" is found +./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt index 68ad707f3..959acaeee 100644 --- a/examples/batched-bench/CMakeLists.txt +++ b/examples/batched-bench/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-batched-bench) add_executable(${TARGET} batched-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md index df67c47e3..4a07fe6bb 100644 --- a/examples/batched-bench/README.md +++ b/examples/batched-bench/README.md @@ -49,12 +49,3 @@ There are 2 modes of operation: | 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 | | 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 | | 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 | - -### JSONL output - -Pass `--output-format jsonl` to output JSONL instead of Markdown, á la - -```json lines -{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094} -{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, 
"is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854} -``` diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 0659ab6f1..718f0a61a 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -1,28 +1,49 @@ -#include "arg.h" #include "common.h" -#include "log.h" #include "llama.h" #include +#include #include #include #include -static void print_usage(int, char ** argv) { - LOG("\nexample usage:\n"); - LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); - LOG("\n"); +// mutates the input string +static std::vector parse_list(char * p) { + std::vector ret; + + char * q = p; + + while (*p) { + if (*p == ',') { + *p = '\0'; + ret.push_back(std::atoi(q)); + q = p + 1; + } + + ++p; + } + + ret.push_back(std::atoi(q)); + + return ret; +} + +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); + LOG_TEE("\n"); } int main(int argc, char ** argv) { - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } - common_init(); - int is_pp_shared = params.is_pp_shared; std::vector n_pp = params.n_pp; @@ -36,21 +57,21 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = common_model_params_to_llama(params); + llama_model_params model_params = llama_model_params_from_gpt_params(params); - llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); + llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } - llama_context_params ctx_params = common_context_params_to_llama(params); + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); // ensure enough sequences are available - ctx_params.n_seq_max = n_pl.empty() ? 
1 : *std::max_element(n_pl.begin(), n_pl.end()); + ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); - llama_context * ctx = llama_init_from_model(model, ctx_params); + llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); @@ -74,11 +95,12 @@ int main(int argc, char ** argv) { batch.n_seq_id + i, batch.seq_id + i, batch.logits + i, + 0, 0, 0, // unused }; const int ret = llama_decode(ctx, batch_view); if (ret != 0) { - LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } @@ -91,22 +113,21 @@ int main(int argc, char ** argv) { // warm up { for (int i = 0; i < 16; ++i) { - common_batch_add(batch, 0, i, { 0 }, false); + llama_batch_add(batch, 0, i, { 0 }, false); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_TEE("%s: llama_decode() failed\n", __func__); return 1; } } - if (!params.batched_bench_output_jsonl) { - LOG("\n"); - LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); - LOG("\n"); - LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); - LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); - } + LOG_TEE("\n"); + LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG_TEE("\n"); + + LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); + LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) { @@ -121,11 +142,11 @@ int main(int argc, char ** argv) { continue; } - common_batch_clear(batch); + llama_batch_clear(batch); for (int i = 0; i < pp; ++i) { for (int j = 0; j < (is_pp_shared ? 
1 : pl); ++j) { - common_batch_add(batch, 0, i, { j }, false); + llama_batch_add(batch, 0, i, { j }, false); } } batch.logits[batch.n_tokens - 1] = true; @@ -135,7 +156,7 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_TEE("%s: llama_decode() failed\n", __func__); return 1; } @@ -150,14 +171,14 @@ int main(int argc, char ** argv) { const auto t_tg_start = ggml_time_us(); for (int i = 0; i < tg; ++i) { - common_batch_clear(batch); + llama_batch_clear(batch); for (int j = 0; j < pl; ++j) { - common_batch_add(batch, 0, pp + i, { j }, true); + llama_batch_add(batch, 0, pp + i, { j }, true); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_TEE("%s: llama_decode() failed\n", __func__); return 1; } } @@ -174,31 +195,21 @@ int main(int argc, char ** argv) { const float speed_tg = pl*tg / t_tg; const float speed = n_kv / t; - if(params.batched_bench_output_jsonl) { - LOG( - "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, " - "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n", - n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch, - pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed - ); - } else { - LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); - } + LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); } } } - LOG("\n"); - llama_perf_context_print(ctx); + llama_print_timings(ctx); llama_batch_free(batch); llama_free(ctx); - llama_model_free(model); + llama_free_model(model); llama_backend_free(); - LOG("\n\n"); + fprintf(stderr, "\n\n"); return 0; } diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 55c31166c..616494d2d 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -23,17 +23,13 @@ defer { } let model_params = llama_model_default_params() -guard let model = llama_model_load_from_file(modelPath.cString(using: .utf8), model_params) else { +guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else { print("Failed to load model") exit(1) } -defer { - llama_model_free(model) -} -guard let vocab = llama_model_get_vocab(model) else { - print("Failed to get vocab") - exit(1) +defer { + llama_free_model(model) } var tokens = tokenize(text: prompt, add_bos: true) @@ -41,36 +37,22 @@ var tokens = tokenize(text: prompt, add_bos: true) let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel) var context_params = llama_context_default_params() +context_params.seed = 1234 context_params.n_ctx = n_kv_req context_params.n_batch = UInt32(max(n_len, n_parallel)) context_params.n_threads = 8 context_params.n_threads_batch = 8 -let context = llama_init_from_model(model, context_params) +let context = llama_new_context_with_model(model, context_params) guard context != nil else { print("Failed to initialize context") exit(1) } + defer { 
llama_free(context) } -var sparams = llama_sampler_chain_default_params() - -let smpl = llama_sampler_chain_init(sparams) -guard smpl != nil else { - print("Failed to initialize sampling") - exit(1) -} -defer { - llama_sampler_free(smpl) -} - -llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40)); -llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1)); -llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4)); -llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234)); - let n_ctx = llama_n_ctx(context) print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n") @@ -143,10 +125,35 @@ while n_cur <= n_len { continue } - let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) + var n_vocab = llama_n_vocab(model) + var logits = llama_get_logits_ith(context, i_batch[i]) + + var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab)) + + for token_id in 0 ..< n_vocab { + candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0)) + } + + var candidates_p: llama_token_data_array = .init( + data: &candidates, + size: candidates.count, + sorted: false + ) + + let top_k: Int32 = 40 + let top_p: Float = 0.9 + let temp: Float = 0.4 + + llama_sample_top_k(context, &candidates_p, top_k, 1) + llama_sample_top_p(context, &candidates_p, top_p, 1) + llama_sample_temp(context, &candidates_p, temp) + + let new_token_id = llama_sample_token(context, &candidates_p) + + // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of stream? -> mark the stream as finished - if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len { + if llama_token_is_eog(model, new_token_id) || n_cur == n_len { i_batch[i] = -1 // print("") if n_parallel > 1 { @@ -203,16 +210,15 @@ if n_parallel > 1 { let t_main_end = ggml_time_us() -print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") +print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n") -llama_perf_sampler_print(smpl) -llama_perf_context_print(context) +llama_print_timings(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count let n_tokens = utf8Count + (add_bos ? 1 : 0) let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) + let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) var swiftTokens: [llama_token] = [] for i in 0 ..< tokenCount { swiftTokens.append(tokens[Int(i)]) @@ -223,12 +229,12 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] { private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? 
{ var result = [CChar](repeating: 0, count: 8) - let nTokens = llama_token_to_piece(vocab, token, &result, Int32(result.count), 0, false) + let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false) if nTokens < 0 { let actualTokensCount = -Int(nTokens) result = .init(repeating: 0, count: actualTokensCount) let check = llama_token_to_piece( - vocab, + model, token, &result, Int32(result.count), diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt index 0d439f498..77e33343b 100644 --- a/examples/batched/CMakeLists.txt +++ b/examples/batched/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-batched) add_executable(${TARGET} batched.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 21b95ef5e..53fbfb0a8 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,30 +1,31 @@ -#include "arg.h" #include "common.h" -#include "log.h" #include "llama.h" #include +#include #include #include #include -static void print_usage(int, char ** argv) { - LOG("\nexample usage:\n"); - LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); - LOG("\n"); +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); + LOG_TEE("\n"); } int main(int argc, char ** argv) { - common_params params; + gpt_params params; params.prompt = "Hello my name is"; params.n_predict = 32; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } - common_init(); // number of parallel batches int n_parallel = params.n_parallel; @@ -39,67 +40,57 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = common_model_params_to_llama(params); + llama_model_params model_params = llama_model_params_from_gpt_params(params); - llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); + llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { - LOG_ERR("%s: error: unable to load model\n" , __func__); + fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } - const llama_vocab * vocab = llama_model_get_vocab(model); - // tokenize the prompt std::vector<llama_token> tokens_list; - tokens_list = common_tokenize(vocab, params.prompt, true); + tokens_list = ::llama_tokenize(model, params.prompt, true); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; // initialize the context - llama_context_params ctx_params = common_context_params_to_llama(params); + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_predict, n_parallel); - llama_context * ctx = llama_init_from_model(model, ctx_params); - - auto sparams = llama_sampler_chain_default_params(); - sparams.no_perf = false; - - llama_sampler * smpl = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k)); - llama_sampler_chain_add(smpl, 
llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep)); - llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp)); - llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed)); + llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { - LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); return 1; } const int n_ctx = llama_n_ctx(ctx); - LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { - LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); - LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__); + LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); + LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); return 1; } // print the prompt token-by-token - LOG("\n"); + fprintf(stderr, "\n"); for (auto id : tokens_list) { - LOG("%s", common_token_to_piece(ctx, id).c_str()); + fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); } + fflush(stderr); + // create a llama_batch // we use this object to submit token data for decoding llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); @@ -111,30 +102,30 @@ int main(int argc, char ** argv) { // evaluate the initial prompt for (size_t i = 0; i < tokens_list.size(); ++i) { - common_batch_add(batch, tokens_list[i], i, seq_ids, false); + llama_batch_add(batch, tokens_list[i], i, seq_ids, false); } GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); if (llama_model_has_encoder(model)) { if (llama_encode(ctx, batch)) { - LOG_ERR("%s : failed to eval\n", __func__); + LOG_TEE("%s : failed to eval\n", __func__); return 1; } llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == LLAMA_TOKEN_NULL) { - decoder_start_token_id = llama_vocab_bos(vocab); + if (decoder_start_token_id == -1) { + decoder_start_token_id = llama_token_bos(model); } - common_batch_clear(batch); - common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); + llama_batch_clear(batch); + llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); } // llama_decode will output logits only for the last token of the prompt batch.logits[batch.n_tokens - 1] = true; if (llama_decode(ctx, batch) != 0) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_TEE("%s: llama_decode() failed\n", __func__); return 1; } @@ -145,7 +136,7 @@ int main(int argc, char ** argv) { //} if (n_parallel > 1) { - LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); } // main loop @@ -164,7 +155,7 @@ int main(int argc, char ** argv) { while (n_cur <= n_predict) { // prepare the next batch - common_batch_clear(batch); + llama_batch_clear(batch); // sample the next token for each parallel sequence / stream for (int32_t i = 0; i < n_parallel; ++i) { @@ -173,14 +164,36 @@ int 
main(int argc, char ** argv) { continue; } - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); + auto n_vocab = llama_n_vocab(model); + auto * logits = llama_get_logits_ith(ctx, i_batch[i]); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + const int top_k = 40; + const float top_p = 0.9f; + const float temp = 0.4f; + + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temp (ctx, &candidates_p, temp); + + const llama_token new_token_id = llama_sample_token(ctx, &candidates_p); + + //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of generation? -> mark the stream as finished - if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) { + if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; - LOG("\n"); + LOG_TEE("\n"); if (n_parallel > 1) { - LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); + LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); } continue; @@ -188,15 +201,16 @@ int main(int argc, char ** argv) { // if there is only one stream, we print immediately to stdout if (n_parallel == 1) { - LOG("%s", common_token_to_piece(ctx, new_token_id).c_str()); + LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + fflush(stdout); } - streams[i] += common_token_to_piece(ctx, new_token_id); + streams[i] += llama_token_to_piece(ctx, new_token_id); i_batch[i] = batch.n_tokens; // push this new token for next evaluation - common_batch_add(batch, new_token_id, n_cur, { i }, true); + llama_batch_add(batch, new_token_id, n_cur, { i }, true); n_decode += 1; } @@ -210,35 +224,34 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); return 1; } } + LOG_TEE("\n"); + if (n_parallel > 1) { - LOG("\n"); + LOG_TEE("\n"); for (int32_t i = 0; i < n_parallel; ++i) { - LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); + LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); } } const auto t_main_end = ggml_time_us(); - LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG("\n"); - llama_perf_sampler_print(smpl); - llama_perf_context_print(ctx); + llama_print_timings(ctx); fprintf(stderr, "\n"); llama_batch_free(batch); - llama_sampler_free(smpl); llama_free(ctx); - llama_model_free(model); + llama_free_model(model); llama_backend_free(); diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt new file mode 100644 index 000000000..34a58cc02 --- /dev/null +++ b/examples/benchmark/CMakeLists.txt @@ -0,0 +1,6 @@ +set(TARGET llama-bench-matmult) +add_executable(${TARGET} benchmark-matmult.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} 
PRIVATE ../../common) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp new file mode 100644 index 000000000..47cb16c69 --- /dev/null +++ b/examples/benchmark/benchmark-matmult.cpp @@ -0,0 +1,275 @@ +#include "common.h" +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +static float tensor_sum_elements(const ggml_tensor * tensor) { + double sum = 0; + if (tensor->type == GGML_TYPE_F32) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + sum += ((float *) tensor->data)[j*tensor->ne[0] + k]; + } + } + } + return sum; +} + +static void tensor_dump(const ggml_tensor * tensor, const char * name) { + printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); + float sum = tensor_sum_elements(tensor); + printf("Sum of tensor %s is %6.2f\n", name, sum); +} + +#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) + +struct benchmark_params_struct { + int32_t n_threads = 1; + int32_t n_iterations = 10; +}; + +static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations); + fprintf(stderr, "\n"); +} + +int main(int argc, char ** argv) { + struct benchmark_params_struct benchmark_params; + + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + benchmark_params.n_threads = std::stoi(argv[i]); + } else if (arg == "-i" || arg == "--iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + benchmark_params.n_iterations = std::stoi(argv[i]); + } else if (arg == "-h" || arg == "--help") { + print_usage(argc, argv, benchmark_params); + exit(0); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + print_usage(argc, argv, benchmark_params); + exit(1); + } + + print_build_info(); + printf("Starting Test\n"); + + // create the ggml context + struct ggml_context * ctx; + //const int sizex = 4096; + //const int sizey = 11008; + +#undef VERBOSE_DEBUGGING +#ifndef VERBOSE_DEBUGGING + const int sizey = 4096; + const int sizex = 11008; + const int sizez = 128; +#else + /* Working - let's increase size */ + const int sizey = 1; + const int sizex = (8*32); + const int sizez = 1; + + /*const int sizey = 1; + const int sizex = 3*(8*32); 
+ const int sizez = 1;*/ +#endif + + //printf("Memsize required = %i\n", sizex*sizex); + + // TODO: perform the bench for all types or for a user specified type + const ggml_type qtype = GGML_TYPE_Q4_1; + + size_t ctx_size = 0; + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez); + ctx_size += ggml_row_size(qtype, sizex*sizey); + ctx_size += ggml_row_size(qtype, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS + ctx_size += 1024*1024*16; + + printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /* no_alloc =*/ 0 + }; + + ctx = ggml_init(params); + if (!ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return 1; + } + + + printf("Creating new tensors\n"); + // printf("Creating new tensor m1\n"); + struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_f32(m11, 1.0f); + + // printf("Creating new tensor m1\n"); + struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_f32(m12, 1.5f); + + // printf("Creating new tensor m2\n"); + struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); + ggml_set_f32(m2, 2.0f); + + printf("\n------ Test 1 - Matrix Mult via F32 code\n"); + // printf("Creating new tensor m11xm2\n"); + struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); + + // printf("Creating compute graph\n"); + struct ggml_cgraph * gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, m11xm2); + + printf("n_threads=%i\n", benchmark_params.n_threads); + + TENSOR_DUMP(m11); + TENSOR_DUMP(m2); + + std::vector<uint8_t> work_buffer; + + ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); + + TENSOR_DUMP(gf->nodes[0]); + + printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); + + int32_t nelements = sizex*sizey; + + // Set up a the benchmark matrices + // printf("Creating new tensor q11 & Running quantize\n"); + struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr); + + // Set up a the compute graph + // printf("Creating new tensor q31\n"); + struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); + + // printf("Creating compute graph\n"); + struct ggml_cgraph * gf31 = ggml_new_graph(ctx); + ggml_build_forward_expand(gf31, q31); + + // Set up a second graph computation to make sure we override the CPU cache lines + // printf("Creating new tensor q12 & Running quantize\n"); + struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr); + + // printf("Creating new tensor q32\n"); + struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); + + //printf("Creating compute graph\n"); + struct ggml_cgraph * gf32 = ggml_new_graph(ctx); + ggml_build_forward_expand(gf32, q32); + printf("n_threads=%i\n", benchmark_params.n_threads); + + const int dimx = sizex; + const int dimy = sizey; + const int dimz = sizez; + long long int flops_per_dot_product = dimy + dimy; + long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; + printf("Matrix Multiplication of 
(%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); + + + // Let's use the F32 result from above as a reference for the quantized multiplication + float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); + + printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); + printf("=====================================================================================\n"); + + double gflops_sum = 0; + for (int i=0;inodes[0]); + float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); + float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 + + if (delta > allowed_delta) { + printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n", + sum_of_F32_reference, + sum_of_Q4_result, + delta, + allowed_delta + ); + exit(0); + } + + // Running a different graph computation to make sure we override the CPU cache lines + ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); + } + printf("\n"); + printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); + printf("=====================================================================================\n"); +} diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh index 9d761ebb8..d9cab9836 100755 --- a/examples/chat-persistent.sh +++ b/examples/chat-persistent.sh @@ -23,9 +23,8 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin" NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt" NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin" -SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\ -'|'\ -'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+' +SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+' +SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+' SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d" CTX_SIZE=2048 @@ -130,12 +129,15 @@ while read -e line; do printf ' ' - if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then + # HACK get num tokens from debug message + # TODO get both messages in one go + if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || + ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then echo >&2 "Couldn't get number of tokens from ./llama-cli output!" 
exit 1 fi - n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")") + n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg"))) if ((n_tokens > CTX_ROTATE_POINT)); then tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE" diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt index 44e5f722a..a6790e617 100644 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml) add_executable(${TARGET} convert-llama2c-to-ggml.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index 46a42da69..5774ac83c 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -2,8 +2,11 @@ This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. -To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository. +To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository: +`$ make -j` + +After successful compilation, following usage options are available: ``` usage: ./llama-convert-llama2c-to-ggml [options] diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index bdf0eed2a..8ca9f8915 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -1,6 +1,4 @@ #include "ggml.h" -#include "gguf.h" - #include "llama.h" #include "common.h" #include "log.h" @@ -11,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -108,43 +105,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_ const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 
1 : p->n_heads / p->n_kv_heads; try { w->token_embedding_table.resize(p->vocab_size * p->dim); - LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); w->rms_att_weight.resize(p->n_layers * p->dim); - LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); w->rms_ffn_weight.resize(p->n_layers * p->dim); - LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); w->wq.resize(p->n_layers * p->dim * p->dim); - LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); w->wo.resize(p->n_layers * p->dim * p->dim); - LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->w1.resize(p->n_layers * p->hidden_dim * p->dim); - LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); w->w2.resize(p->n_layers * p->hidden_dim * p->dim); - LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); w->w3.resize(p->n_layers * p->hidden_dim * p->dim); - LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for 
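The allocation above shrinks the K/V projection matrices by the head-grouping factor whenever the checkpoint uses multi-query or grouped-query attention. A minimal sketch of that sizing logic in isolation (the struct and the concrete numbers are illustrative):

```cpp
#include <cstdio>

struct Config {
    int dim;        // model width
    int n_layers;
    int n_heads;    // query heads
    int n_kv_heads; // key/value heads (<= n_heads); 0 means "same as n_heads"
};

int main() {
    const Config p = { 4096, 32, 32, 8 }; // illustrative grouped-query config

    // 1 when every query head has its own K/V head, otherwise the grouping factor.
    const int n_multiqueries = (p.n_kv_heads <= 0 || p.n_kv_heads >= p.n_heads)
                                   ? 1
                                   : p.n_heads / p.n_kv_heads;

    const size_t wq_elems = (size_t) p.n_layers * p.dim * p.dim;
    const size_t wk_elems = wq_elems / n_multiqueries; // wk and wv shrink by that factor
    printf("n_multiqueries = %d, wq = %zu floats, wk = %zu floats\n",
           n_multiqueries, wq_elems, wk_elems);
    return 0;
}
```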
w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); w->rms_final_weight.resize(p->dim); - LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); + LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); if (shared_weights) { w->wcls = {}; } else { w->wcls.resize(p->vocab_size * p->dim); - LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); } } catch (std::length_error &) { @@ -176,7 +173,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL fseek(f, 0, SEEK_END); auto end = ftell(f); if (curr != end) { - LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); + LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); return 1; } @@ -184,26 +181,26 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL } static void print_sample_weights(TransformerWeights *w){ - LOG_INF("----- Quick print of first of the weight vales of all the variables\n"); - LOG_INF("%f\n", w->token_embedding_table[0]); - LOG_INF("%f\n", w->rms_att_weight[0]); - LOG_INF("%f\n", w->rms_ffn_weight[0]); + LOG("----- Quick print of first of the weight vales of all the variables\n"); + LOG("%f\n", w->token_embedding_table[0]); + LOG("%f\n", w->rms_att_weight[0]); + LOG("%f\n", w->rms_ffn_weight[0]); - LOG_INF("%f\n", w->wq[0]); - LOG_INF("%f\n", w->wk[0]); - LOG_INF("%f\n", w->wv[0]); - LOG_INF("%f\n", w->wo[0]); - LOG_INF("%f\n", w->w1[0]); - LOG_INF("%f\n", w->w2[0]); - LOG_INF("%f\n", w->w3[0]); - LOG_INF("%f\n", w->rms_att_weight[0]); - if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]); + LOG("%f\n", w->wq[0]); + LOG("%f\n", w->wk[0]); + LOG("%f\n", w->wv[0]); + LOG("%f\n", w->wo[0]); + LOG("%f\n", w->w1[0]); + LOG("%f\n", w->w2[0]); + LOG("%f\n", w->w3[0]); + LOG("%f\n", w->rms_att_weight[0]); + if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model. 
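checkpoint_init_weights finishes by verifying that the parser consumed every byte of the checkpoint, comparing the current position against the end of the file. A hedged sketch of that end-of-file sanity check on its own (the function name and the CLI wrapper are illustrative):

```cpp
#include <cstdio>

// Returns true if the current read position is exactly the end of the file,
// i.e. nothing was left over after parsing.
static bool read_to_end(FILE * f) {
    const long curr = ftell(f);
    fseek(f, 0, SEEK_END);
    const long end = ftell(f);
    fseek(f, curr, SEEK_SET); // restore the position for the caller
    if (curr != end) {
        fprintf(stderr, "failed to read the file to the end (curr = %ld, end = %ld)\n", curr, end);
        return false;
    }
    return true;
}

int main(int argc, char ** argv) {
    if (argc < 2) { return 1; }
    FILE * f = fopen(argv[1], "rb");
    if (!f) { return 1; }
    fseek(f, 0, SEEK_END);          // pretend we already read everything
    const bool ok = read_to_end(f);
    fclose(f);
    return ok ? 0 : 1;
}
```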
-struct my_llama_vocab { +struct llama_vocab { using id = int32_t; using token = std::string; using ttype = llama_token_type; @@ -321,20 +318,20 @@ struct train_params { }; static void print_params(struct my_llama_hparams * params) { - LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab); - LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx); - LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd); - LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult); - LOG_INF("%s: n_head: %u\n", __func__, params->n_head); - LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv); - LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff); - LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer); - LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot); + LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); + LOG("%s: n_ctx: %u\n", __func__, params->n_ctx); + LOG("%s: n_embd: %u\n", __func__, params->n_embd); + LOG("%s: n_mult: %u\n", __func__, params->n_mult); + LOG("%s: n_head: %u\n", __func__, params->n_head); + LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); + LOG("%s: n_ff: %u\n", __func__, params->n_ff); + LOG("%s: n_layer: %u\n", __func__, params->n_layer); + LOG("%s: n_rot: %u\n", __func__, params->n_rot); } static void print_tensor_info(const struct ggml_context * ctx) { for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - LOG_INF("%s: Allocating ", __func__); + LOG("%s: Allocating ", __func__); int64_t total = 1; int i = 0; for (; i < ggml_n_dims(t); ++i) { @@ -436,12 +433,12 @@ static void print_matrix(struct ggml_tensor * probs) { } } -struct my_llama_file { +struct llama_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; size_t size; - my_llama_file(const char * fname, const char * mode) { + llama_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); if (fp == NULL) { size = 0; @@ -502,7 +499,7 @@ struct my_llama_file { return std::string(chars.data(), len); } - ~my_llama_file() { + ~llama_file() { if (fp) { std::fclose(fp); } @@ -510,7 +507,7 @@ struct my_llama_file { }; static bool is_ggml_file(const char * filename) { - my_llama_file file(filename, "rb"); + llama_file file(filename, "rb"); if (file.size < 4) { return false; } @@ -527,9 +524,9 @@ static std::string llama_escape_whitespaces(const std::string & text) { return out.str(); } -static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) { +static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { if (is_ggml_file(filename)) { - LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename); + LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { @@ -577,21 +574,21 @@ static void load_vocab(const char * filename, const Config * config, struct my_l gguf_free(ctx); } else { // assume llama2.c vocabulary - LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); - my_llama_file file(filename, "rb"); + LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); + llama_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); } const int n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused vocab->id_to_token.resize(n_vocab); - for (my_llama_vocab::id id=0; idtoken_embedding_table -> model->tok_embeddings @@ -673,7 +670,7 @@ 
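load_vocab branches on whether the vocab file is a gguf file or a raw llama2.c tokenizer, using a small magic-byte probe like is_ggml_file above. A minimal sketch of that detection, assuming the gguf magic is the four bytes "GGUF" as in the gguf file format (the helper name is illustrative):

```cpp
#include <cstdio>
#include <cstring>

// Probe the first four bytes of a file; "GGUF" means gguf,
// anything else is treated as a llama2.c vocabulary.
static bool looks_like_gguf(const char * filename) {
    FILE * f = fopen(filename, "rb");
    if (!f) {
        return false;
    }
    char magic[4] = { 0 };
    const size_t n = fread(magic, 1, sizeof(magic), f);
    fclose(f);
    return n == sizeof(magic) && memcmp(magic, "GGUF", 4) == 0;
}

int main(int argc, char ** argv) {
    if (argc < 2) { return 1; }
    printf("%s: %s\n", argv[1],
           looks_like_gguf(argv[1]) ? "gguf vocabulary" : "assuming llama2.c vocabulary");
    return 0;
}
```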
static void save_as_llama_model( std::vector tokens; std::vector scores; std::vector token_types; - for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) { + for (const llama_vocab::token_data & token_data : vocab->id_to_token) { tokens.push_back(token_data.text.c_str()); scores.push_back(token_data.score); token_types.push_back(token_data.type); @@ -691,8 +688,8 @@ static void save_as_llama_model( gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID); - gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL); - gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL); + gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1); + gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1); gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx); gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); @@ -874,25 +871,23 @@ static std::string basename(const std::string &path) { } int main(int argc, char ** argv) { - common_init(); - struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, ¶ms)) { return 1; } - + log_set_target(stdout); Config config; TransformerWeights weights = {}; { - LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); + LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); FILE * file = fopen(params.fn_llama2c_model, "rb"); if (!file) { - LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); + LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); return 1; } // read in the config header if (fread(&config, sizeof(Config), 1, file) != 1) { - LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); + LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); return 1; } auto shared_weights = config.vocab_size > 0; @@ -901,17 +896,17 @@ int main(int argc, char ** argv) { // read in the Transformer weights alloc_weights(&weights, &config, shared_weights); if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { - LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); + LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); return 1; } fclose(file); } - struct my_llama_vocab vocab; + struct llama_vocab vocab; load_vocab(params.fn_vocab_model, &config, &vocab); struct my_llama_model model; - model.hparams.n_vocab = config.vocab_size; //llama_vocab_n_vocab(lctx); + model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; model.hparams.n_embd = config.dim; //params.n_embd; model.hparams.n_ff = config.hidden_dim; @@ -934,7 +929,7 @@ int main(int argc, char ** argv) { model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); - LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); + LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); return 0; diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index c4ec5c524..9ab9ab06e 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ 
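main() reads the llama2.c header by fread-ing the packed Config struct directly and derives shared_weights from the sign of vocab_size, as shown in the hunk above. A compact, hedged sketch of that header load (the field list follows llama2.c's exporter, which should be treated as an assumption here; paths and output are illustrative):

```cpp
#include <cstdio>
#include <cstdlib>

// Packed header layout used by llama2.c checkpoints (seven 32-bit ints).
struct Config {
    int dim;         // transformer width
    int hidden_dim;  // ffn width
    int n_layers;
    int n_heads;
    int n_kv_heads;
    int vocab_size;  // negative value signals an unshared classifier head
    int seq_len;
};

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.bin\n", argv[0]);
        return 1;
    }
    FILE * file = fopen(argv[1], "rb");
    if (!file) {
        fprintf(stderr, "unable to open the checkpoint file %s\n", argv[1]);
        return 1;
    }
    Config config;
    if (fread(&config, sizeof(Config), 1, file) != 1) {
        fprintf(stderr, "unable to read llama2c config from %s\n", argv[1]);
        fclose(file);
        return 1;
    }
    const bool shared_weights = config.vocab_size > 0; // sign convention from the hunk above
    config.vocab_size = abs(config.vocab_size);
    printf("dim=%d layers=%d heads=%d kv_heads=%d vocab=%d shared_weights=%d\n",
           config.dim, config.n_layers, config.n_heads, config.n_kv_heads,
           config.vocab_size, (int) shared_weights);
    fclose(file);
    return 0;
}
```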
-840,8 +840,6 @@ class OutputFile: self.gguf.add_base_model_version(key, base_model_entry["version"]) if "organization" in base_model_entry: self.gguf.add_base_model_organization(key, base_model_entry["organization"]) - if "description" in base_model_entry: - self.gguf.add_base_model_description(key, base_model_entry["description"]) if "url" in base_model_entry: self.gguf.add_base_model_url(key, base_model_entry["url"]) if "doi" in base_model_entry: @@ -851,32 +849,12 @@ class OutputFile: if "repo_url" in base_model_entry: self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"]) - if metadata.datasets is not None: - self.gguf.add_dataset_count(len(metadata.datasets)) - for key, dataset_entry in enumerate(metadata.datasets): - if "name" in dataset_entry: - self.gguf.add_dataset_name(key, dataset_entry["name"]) - if "author" in dataset_entry: - self.gguf.add_dataset_author(key, dataset_entry["author"]) - if "version" in dataset_entry: - self.gguf.add_dataset_version(key, dataset_entry["version"]) - if "organization" in dataset_entry: - self.gguf.add_dataset_organization(key, dataset_entry["organization"]) - if "description" in dataset_entry: - self.gguf.add_dataset_description(key, dataset_entry["description"]) - if "url" in dataset_entry: - self.gguf.add_dataset_url(key, dataset_entry["url"]) - if "doi" in dataset_entry: - self.gguf.add_dataset_doi(key, dataset_entry["doi"]) - if "uuid" in dataset_entry: - self.gguf.add_dataset_uuid(key, dataset_entry["uuid"]) - if "repo_url" in dataset_entry: - self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"]) - if metadata.tags is not None: self.gguf.add_tags(metadata.tags) if metadata.languages is not None: self.gguf.add_languages(metadata.languages) + if metadata.datasets is not None: + self.gguf.add_datasets(metadata.datasets) def add_meta_arch(self, params: Params) -> None: # Metadata About The Neural Architecture Itself diff --git a/examples/cvector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt index 49ad9561c..0a559d60c 100644 --- a/examples/cvector-generator/CMakeLists.txt +++ b/examples/cvector-generator/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 413b71d34..d4e126ac2 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -1,9 +1,6 @@ -#include "ggml.h" -#include "gguf.h" - -#include "arg.h" #include "common.h" #include "llama.h" +#include "ggml.h" #include "pca.hpp" #include "mean.hpp" @@ -15,15 +12,14 @@ #include "ggml-metal.h" #endif -#include -#include #include -#include -#include -#include #include #include #include +#include +#include +#include +#include ////////////////////////////////////////////////// @@ -33,13 +29,15 @@ template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; ++begin) { - ret += common_token_to_piece(ctx, *begin); + ret += llama_token_to_piece(ctx, *begin); } return ret; } -static void print_usage(int, char ** argv) { +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, 
argv, params); + printf("\nexample usage:\n"); printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); @@ -273,11 +271,9 @@ struct tokenized_prompt { size_t max_seq_len; tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - const bool add_bos = llama_vocab_get_add_bos(vocab); - tokens_pos = common_tokenize(ctx, pos, add_bos, true); - tokens_neg = common_tokenize(ctx, neg, add_bos, true); + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true); + tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); padding_seq(ctx, tokens_pos, max_seq_len); padding_seq(ctx, tokens_neg, max_seq_len); @@ -285,7 +281,7 @@ struct tokenized_prompt { void padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { // TODO: customize padding token - std::vector pad_tokens = common_tokenize(ctx, " ", false); + std::vector pad_tokens = ::llama_tokenize(ctx, " ", false); llama_token pad_tok = pad_tokens.back(); while (tokens.size() < len) { tokens.push_back(pad_tok); @@ -343,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { llama_kv_cache_clear(ctx); - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; } @@ -374,7 +370,7 @@ static void export_gguf(const std::vector & v_ctrl, const * Load prompt files and completion file. * Then format each pair of prompt + completion to make an entry. 
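tokenized_prompt pads the positive and negative token sequences to the same length so they can later be compared position by position. A minimal sketch of that padding step using plain token ids (the pad token value and the ids are illustrative; llama.cpp itself is deliberately left out):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_token = int32_t;

// Append pad_tok until the sequence reaches len, mirroring padding_seq above.
static void pad_to(std::vector<llama_token> & tokens, size_t len, llama_token pad_tok) {
    while (tokens.size() < len) {
        tokens.push_back(pad_tok);
    }
}

int main() {
    std::vector<llama_token> tokens_pos = { 1, 523, 912, 87 }; // illustrative ids
    std::vector<llama_token> tokens_neg = { 1, 523, 40 };
    const llama_token pad_tok = 29871;                         // illustrative pad token

    const size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
    pad_to(tokens_pos, max_seq_len, pad_tok);
    pad_to(tokens_neg, max_seq_len, pad_tok);

    printf("both sequences now have %zu tokens\n", tokens_pos.size());
    return 0;
}
```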
*/ -static int prepare_entries(common_params & params, train_context & ctx_train) { +static int prepare_entries(gpt_params & params, train_context & ctx_train) { // load prompts std::vector positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true); std::vector negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true); @@ -392,9 +388,10 @@ static int prepare_entries(common_params & params, train_context & ctx_train) { } int main(int argc, char ** argv) { - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } @@ -417,15 +414,13 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model to get hparams - common_init_result llama_init = common_init_from_params(params); - - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); // int n_ctx = llama_n_ctx(ctx); - int n_layers = llama_model_n_layer(model); - int n_embd = llama_model_n_embd(model); - + int n_layers = llama_n_layer(model); + int n_embd = llama_n_embd(model); // get model hint param (a.k.a model arch name) char model_hint[128]; llama_model_meta_val_str(model, "general.architecture", model_hint, 128); @@ -479,6 +474,8 @@ int main(int argc, char ** argv) { // done with the model, we can now free it to make gain some memory printf("Done evaluate prompts, unload model...\n"); + llama_free(ctx); + llama_free_model(model); bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; @@ -488,8 +485,8 @@ int main(int argc, char ** argv) { if (use_pca) { // run PCA PCA::pca_params pca_params; - pca_params.n_threads = params.cpuparams.n_threads; - pca_params.n_batch = params.n_pca_batch; + pca_params.n_threads = params.n_threads; + pca_params.n_batch = params.n_pca_batch; pca_params.n_iterations = params.n_pca_iterations; PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); } else { diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp index 4eeac1eeb..16be5ce3e 100644 --- a/examples/cvector-generator/mean.hpp +++ b/examples/cvector-generator/mean.hpp @@ -15,7 +15,7 @@ static void run( for (size_t il = 0; il < v_input.size(); ++il) { // prepare output vector struct ggml_tensor * ctrl_out = v_output[il]; - ggml_format_name(ctrl_out, "direction.%zu", il+1); + ggml_format_name(ctrl_out, "direction.%ld", il+1); // calculate mean vector struct ggml_tensor * t_layer = v_input[il]; diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index e88bbdde9..6ec3141af 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -12,9 +12,12 @@ #include #include -#include #include +#include #include +#include +#include +#include #define DEBUG_POS 5 @@ -204,6 +207,13 @@ static ggml_status compute_piter( ggml_backend_cpu_set_n_threads(model.backend, params.n_threads); } +// TODO: enable GPU support when support for GGML_OP_SQRT is added +//#ifdef GGML_USE_METAL +// if (ggml_backend_is_metal(model.backend)) { +// ggml_backend_metal_set_n_cb(model.backend, params.n_threads); +// } +//#endif + ggml_status res = ggml_backend_graph_compute(model.backend, gf); if (res == GGML_STATUS_SUCCESS) { auto extract_i = [](std::string prefix, std::string str) -> int { @@ -219,8 
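Both the mean and the PCA paths name their per-layer output tensors direction.1, direction.2, ... via ggml_format_name; since il+1 is a size_t, %zu is the portable format (the %ld variant assumes long). A tiny standalone sketch of the same naming scheme with snprintf, with no ggml dependency (the layer count is illustrative):

```cpp
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const size_t n_layers = 3; // illustrative
    std::vector<std::string> names;
    for (size_t il = 0; il < n_layers; ++il) {
        char buf[32];
        // %zu is the portable conversion for size_t.
        snprintf(buf, sizeof(buf), "direction.%zu", il + 1);
        names.push_back(buf);
    }
    for (const auto & name : names) {
        printf("%s\n", name.c_str()); // direction.1, direction.2, direction.3
    }
    return 0;
}
```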
+229,8 @@ static ggml_status compute_piter( result.eigenvectors.resize(params.n_batch); result.distances.resize(params.n_batch); // get output nodes - for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { - auto node = ggml_graph_node(gf, i); + for (int i = 0; i < gf->n_nodes; ++i) { + auto node = gf->nodes[i]; int iter = -1; // find b_tensor (without copying data from device) if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { @@ -302,7 +312,7 @@ static void run_pca( // prepare output vector struct ggml_tensor * ctrl_out = v_output[il]; - ggml_format_name(ctrl_out, "direction.%zu", il+1); + ggml_format_name(ctrl_out, "direction.%ld", il+1); // run power_iteration params.i_layer = il; diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md index 59918ec2b..1e20feb4a 100644 --- a/examples/deprecation-warning/README.md +++ b/examples/deprecation-warning/README.md @@ -13,6 +13,7 @@ Please update all scripts and workflows to use the new binary names. | server | llama-server | | llama-bench | llama-bench | | embedding | llama-embedding | +| finetune | llama-finetune | | quantize | llama-quantize | | tokenize | llama-tokenize | | export-lora | llama-export-lora | @@ -44,6 +45,7 @@ Please update all scripts and workflows to use the new binary names. | save-load-state | llama-save-load-state | | simple | llama-simple | | speculative | llama-speculative | +| train-text-from-scratch | llama-train-text-from-scratch | | vdot | llama-vdot | | tests/test-c.o | tests/test-c.o | diff --git a/examples/deprecation-warning/deprecation-warning.cpp b/examples/deprecation-warning/deprecation-warning.cpp index c2958ea12..11b35d2c2 100644 --- a/examples/deprecation-warning/deprecation-warning.cpp +++ b/examples/deprecation-warning/deprecation-warning.cpp @@ -12,7 +12,7 @@ int main(int argc, char** argv) { } // Get only the program name from the full path - auto pos = filename.find_last_of("/\\"); + auto pos = filename.find_last_of('/'); if (pos != std::string::npos) { filename = filename.substr(pos+1); } diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 809040307..8256e789a 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 12b372bf1..e3705b454 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor ### Unix-based systems (Linux, macOS, etc.): ```bash -./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null +./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null ``` ### Windows: ```powershell -llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null +llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null ``` The above command will output space-separated float values. @@ -50,11 +50,11 @@ The above command will output space-separated float values. 
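The deprecation-warning change above trims the executable path down to the bare program name; searching for the last of both "/" and "\" keeps that working for Windows-style paths as well. A minimal sketch of that basename extraction (function name and example paths are illustrative):

```cpp
#include <cstdio>
#include <string>

// Strip everything up to and including the last path separator,
// accepting both '/' and '\\' so Windows paths are handled too.
static std::string program_name(std::string filename) {
    const auto pos = filename.find_last_of("/\\");
    if (pos != std::string::npos) {
        filename = filename.substr(pos + 1);
    }
    return filename;
}

int main() {
    printf("%s\n", program_name("./build/bin/llama-cli").c_str());      // llama-cli
    printf("%s\n", program_name("C:\\llama\\build\\main.exe").c_str()); // main.exe
    return 0;
}
```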
### Unix-based systems (Linux, macOS, etc.): ```bash -./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` ### Windows: ```powershell -llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 38d22c90f..1466e5b2b 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,6 +1,4 @@ -#include "arg.h" #include "common.h" -#include "log.h" #include "llama.h" #include @@ -28,29 +26,18 @@ static std::vector split_lines(const std::string & s, const std::st static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { size_t n_tokens = tokens.size(); for (size_t i = 0; i < n_tokens; i++) { - common_batch_add(batch, tokens[i], i, { seq_id }, true); + llama_batch_add(batch, tokens[i], i, { seq_id }, true); } } static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { - const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - const struct llama_model * model = llama_get_model(ctx); - // clear previous kv_cache values (irrelevant for embeddings) llama_kv_cache_clear(ctx); // run model - LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); - if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) { - // encoder-only model - if (llama_encode(ctx, batch) < 0) { - LOG_ERR("%s : failed to encode\n", __func__); - } - } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) { - // decoder-only model - if (llama_decode(ctx, batch) < 0) { - LOG_ERR("%s : failed to decode\n", __func__); - } + fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + if (llama_decode(ctx, batch) < 0) { + fprintf(stderr, "%s : failed to decode\n", __func__); } for (int i = 0; i < batch.n_tokens; i++) { @@ -58,74 +45,68 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu continue; } - const float * embd = nullptr; - int embd_pos = 0; + // try to get sequence embeddings - supported only when pooling_type is not NONE + const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - // try to get token embeddings - embd = llama_get_embeddings_ith(ctx, i); - embd_pos = i; - GGML_ASSERT(embd != NULL && "failed to get token embeddings"); - } else { - // try to get sequence embeddings - supported only when pooling_type is not NONE - embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - embd_pos = batch.seq_id[i][0]; - GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); - } - - float * out = output + embd_pos 
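The README invocation above feeds several prompts in one string separated by `<#sep#>`, and the example splits them before embedding. A small sketch of that splitting step (the separator and prompt text come from the README command; the helper name is illustrative):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Split s on every occurrence of sep; an input without separators yields one prompt.
static std::vector<std::string> split_prompts(const std::string & s, const std::string & sep) {
    std::vector<std::string> out;
    size_t start = 0;
    size_t pos;
    while ((pos = s.find(sep, start)) != std::string::npos) {
        out.push_back(s.substr(start, pos - start));
        start = pos + sep.size();
    }
    out.push_back(s.substr(start));
    return out;
}

int main() {
    const auto prompts = split_prompts("Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat", "<#sep#>");
    for (size_t i = 0; i < prompts.size(); ++i) {
        printf("prompt %zu: %s\n", i, prompts[i].c_str());
    }
    return 0;
}
```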
* n_embd; - common_embd_normalize(embd, out, n_embd, embd_norm); + float * out = output + batch.seq_id[i][0] * n_embd; + llama_embd_normalize(embd, out, n_embd, embd_norm); } } int main(int argc, char ** argv) { - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); - params.embedding = true; // For non-causal models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + llama_backend_init(); llama_numa_init(params.numa); + llama_model * model; + llama_context * ctx; + // load the model - common_init_result llama_init = common_init_from_params(params); - - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); - + std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == NULL) { - LOG_ERR("%s: unable to load model\n", __func__); + fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } - const llama_vocab * vocab = llama_model_get_vocab(model); - - const int n_ctx_train = llama_model_n_ctx_train(model); + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - - if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) { - LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__); + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); return 1; } if (n_ctx > n_ctx_train) { - LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n", + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } // split the prompt into lines @@ -138,9 +119,9 @@ int main(int argc, char ** argv) { // tokenize the prompts and trim std::vector> inputs; for (const auto & prompt : prompts) { - auto inp = common_tokenize(ctx, prompt, true, true); + auto inp = ::llama_tokenize(ctx, prompt, true, false); if (inp.size() > n_batch) { - LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); return 1; } @@ -150,21 +131,21 @@ int main(int argc, char ** argv) { // check if the last token is SEP // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' for (auto & inp : inputs) { - if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) { - LOG_WRN("%s: last token in the prompt is not SEP\n", __func__); - LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); + if (inp.empty() || inp.back() != llama_token_sep(model)) { + fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", 
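Each pooled embedding is copied into the output buffer through an embd_normalize step; with the setting of 2 used in the README commands this is a Euclidean (L2) normalisation (that value-to-norm mapping is my reading of the option, so treat it as an assumption). A minimal sketch of L2-normalising one vector:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Scale the vector to unit Euclidean length; leave it untouched if the norm is 0.
static void embd_normalize_l2(const float * inp, float * out, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        sum += (double) inp[i] * inp[i];
    }
    const double norm  = std::sqrt(sum);
    const double scale = norm > 0.0 ? 1.0 / norm : 0.0;
    for (int i = 0; i < n; i++) {
        out[i] = (float) (inp[i] * scale);
    }
}

int main() {
    const std::vector<float> embd = { 3.0f, 4.0f };
    std::vector<float> out(embd.size());
    embd_normalize_l2(embd.data(), out.data(), (int) embd.size());
    printf("%.2f %.2f\n", out[0], out[1]); // 0.60 0.80
    return 0;
}
```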
__func__); + fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); } } // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) inputs.size(); i++) { - LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); - LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); + fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); for (int j = 0; j < (int) inputs[i].size(); j++) { - LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); } - LOG("\n\n"); + fprintf(stderr, "\n\n"); } } @@ -172,23 +153,13 @@ int main(int argc, char ** argv) { const int n_prompts = prompts.size(); struct llama_batch batch = llama_batch_init(n_batch, 0, 1); - // count number of embeddings - int n_embd_count = 0; - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - for (int k = 0; k < n_prompts; k++) { - n_embd_count += inputs[k].size(); - } - } else { - n_embd_count = n_prompts; - } - // allocate output - const int n_embd = llama_model_n_embd(model); - std::vector embeddings(n_embd_count * n_embd, 0); + const int n_embd = llama_n_embd(model); + std::vector embeddings(n_prompts * n_embd, 0); float * emb = embeddings.data(); // break into batches - int e = 0; // number of embeddings already stored + int p = 0; // number of prompts processed already int s = 0; // number of prompts in current batch for (int k = 0; k < n_prompts; k++) { // clamp to n_batch tokens @@ -198,11 +169,11 @@ int main(int argc, char ** argv) { // encode if at capacity if (batch.n_tokens + n_toks > n_batch) { - float * out = emb + e * n_embd; + float * out = emb + p * n_embd; batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); - e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; + llama_batch_clear(batch); + p += s; s = 0; - common_batch_clear(batch); } // add to batch @@ -211,67 +182,39 @@ int main(int argc, char ** argv) { } // final batch - float * out = emb + e * n_embd; + float * out = emb + p * n_embd; batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); if (params.embd_out.empty()) { - LOG("\n"); + // print the first part of the embeddings or for a single prompt, the full embedding + fprintf(stdout, "\n"); + for (int j = 0; j < n_prompts; j++) { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { + if (params.embd_normalize == 0) { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } else { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + } + fprintf(stdout, "\n"); + } - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - for (int j = 0; j < n_embd_count; j++) { - LOG("embedding %d: ", j); - for (int i = 0; i < std::min(3, n_embd); i++) { - if (params.embd_normalize == 0) { - LOG("%6.0f ", emb[j * n_embd + i]); - } else { - LOG("%9.6f ", emb[j * n_embd + i]); - } - } - LOG(" ... 
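The main loop above packs whole prompts into a batch and flushes ("decodes") it whenever adding the next prompt would exceed n_batch tokens, then finishes with a final partial batch. A stripped-down sketch of that packing strategy with plain token counts (no llama.cpp types; the decode call is simulated by a printf and all numbers are illustrative):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const int n_batch = 8;                                   // batch capacity in tokens
    const std::vector<int> prompt_sizes = { 3, 4, 2, 5, 1 }; // tokens per prompt

    int batch_tokens = 0; // tokens currently in the batch
    int batch_seqs   = 0; // prompts currently in the batch

    for (size_t k = 0; k < prompt_sizes.size(); k++) {
        const int n_toks = std::min(prompt_sizes[k], n_batch); // clamp to n_batch tokens

        // flush if adding this prompt would overflow the batch
        if (batch_tokens + n_toks > n_batch) {
            printf("decode batch: %d tokens, %d sequences\n", batch_tokens, batch_seqs);
            batch_tokens = 0;
            batch_seqs   = 0;
        }
        batch_tokens += n_toks;
        batch_seqs   += 1;
    }
    // final, possibly partial batch
    printf("decode batch: %d tokens, %d sequences\n", batch_tokens, batch_seqs);
    return 0;
}
```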
"); - for (int i = n_embd - 3; i < n_embd; i++) { - if (params.embd_normalize == 0) { - LOG("%6.0f ", emb[j * n_embd + i]); - } else { - LOG("%9.6f ", emb[j * n_embd + i]); - } - } - LOG("\n"); + // print cosine similarity matrix + if (n_prompts > 1) { + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) { + fprintf(stdout, "%6.6s ", prompts[i].c_str()); } - } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) { - for (int j = 0; j < n_embd_count; j++) { - // NOTE: if you change this log - update the tests in ci/run.sh - LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]); - } - } else { - // print the first part of the embeddings or for a single prompt, the full embedding - for (int j = 0; j < n_prompts; j++) { - LOG("embedding %d: ", j); - for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { - if (params.embd_normalize == 0) { - LOG("%6.0f ", emb[j * n_embd + i]); - } else { - LOG("%9.6f ", emb[j * n_embd + i]); - } - } - LOG("\n"); - } - - // print cosine similarity matrix - if (n_prompts > 1) { - LOG("\n"); - LOG("cosine similarity matrix:\n\n"); - for (int i = 0; i < n_prompts; i++) { - LOG("%6.6s ", prompts[i].c_str()); - } - LOG("\n"); - for (int i = 0; i < n_prompts; i++) { - for (int j = 0; j < n_prompts; j++) { - float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - LOG("%6.2f ", sim); - } - LOG("%1.10s", prompts[i].c_str()); - LOG("\n"); + fprintf(stdout, "\n"); + for (int i = 0; i < n_prompts; i++) { + for (int j = 0; j < n_prompts; j++) { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); } + fprintf(stdout, "%1.10s", prompts[i].c_str()); + fprintf(stdout, "\n"); } } } @@ -279,46 +222,46 @@ int main(int argc, char ** argv) { if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") { const bool notArray = params.embd_out != "array"; - LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); + fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); for (int j = 0;;) { // at least one iteration (one prompt) - if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); - LOG("["); + if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); + fprintf(stdout, "["); for (int i = 0;;) { // at least one iteration (n_embd > 0) - LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); + fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); i++; - if (i < n_embd) LOG(","); else break; + if (i < n_embd) fprintf(stdout, ","); else break; } - LOG(notArray ? "]\n }" : "]"); + fprintf(stdout, notArray ? "]\n }" : "]"); j++; - if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break; + if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break; } - LOG(notArray ? "\n ]" : "]\n"); + fprintf(stdout, notArray ? 
"\n ]" : "]\n"); if (params.embd_out == "json+" && n_prompts > 1) { - LOG(",\n \"cosineSimilarity\": [\n"); - for (int i = 0;;) { // at least two iteration (n_embd_count > 1) - LOG(" ["); - for (int j = 0;;) { // at least two iteration (n_embd_count > 1) - float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - LOG("%6.2f", sim); + fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); + for (int i = 0;;) { // at least two iteration (n_prompts > 1) + fprintf(stdout, " ["); + for (int j = 0;;) { // at least two iteration (n_prompts > 1) + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f", sim); j++; - if (j < n_embd_count) LOG(", "); else break; + if (j < n_prompts) fprintf(stdout, ", "); else break; } - LOG(" ]"); + fprintf(stdout, " ]"); i++; - if (i < n_embd_count) LOG(",\n"); else break; + if (i < n_prompts) fprintf(stdout, ",\n"); else break; } - LOG("\n ]"); + fprintf(stdout, "\n ]"); } - if (notArray) LOG("\n}\n"); + if (notArray) fprintf(stdout, "\n}\n"); } - LOG("\n"); - llama_perf_context_print(ctx); - // clean up + llama_print_timings(ctx); llama_batch_free(batch); + llama_free(ctx); + llama_free_model(model); llama_backend_free(); return 0; diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index 95915ed91..a48753d38 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -2,9 +2,8 @@ set(TARGET llama-eval-callback) add_executable(${TARGET} eval-callback.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} - COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index fb188f5a9..c8a3016a4 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,11 +1,11 @@ -#include "arg.h" #include "common.h" -#include "log.h" #include "llama.h" #include "ggml.h" #include +#include #include +#include #include /** @@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne GGML_ASSERT(n > 0); float sum = 0; for (int64_t i3 = 0; i3 < ne[3]; i3++) { - LOG(" [\n"); + printf(" [\n"); for (int64_t i2 = 0; i2 < ne[2]; i2++) { if (i2 == n && ne[2] > 2*n) { - LOG(" ..., \n"); + printf(" ..., \n"); i2 = ne[2] - n; } - LOG(" [\n"); + printf(" [\n"); for (int64_t i1 = 0; i1 < ne[1]; i1++) { if (i1 == n && ne[1] > 2*n) { - LOG(" ..., \n"); + printf(" ..., \n"); i1 = ne[1] - n; } - LOG(" ["); + printf(" ["); for (int64_t i0 = 0; i0 < ne[0]; i0++) { if (i0 == n && ne[0] > 2*n) { - LOG("..., "); + printf("..., "); i0 = ne[0] - n; } size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; @@ -62,18 +62,18 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne } else if (type == GGML_TYPE_I8) { v = (float) *(int8_t *) &data[i]; } else { - GGML_ABORT("fatal 
error"); + GGML_ASSERT(false); } - LOG("%12.4f", v); + printf("%12.4f", v); sum += v; - if (i0 < ne[0] - 1) LOG(", "); + if (i0 < ne[0] - 1) printf(", "); } - LOG("],\n"); + printf("],\n"); } - LOG(" ],\n"); + printf(" ],\n"); } - LOG(" ]\n"); - LOG(" sum = %f\n", sum); + printf(" ]\n"); + printf(" sum = %f\n", sum); } } @@ -102,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); } - LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); + printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, + t->name, ggml_type_name(t->type), ggml_op_desc(t), + src0->name, ggml_ne_string(src0).c_str(), + src1 ? src1_str : "", + ggml_ne_string(t).c_str()); // copy the data from the GPU memory if needed @@ -126,16 +126,13 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { return true; } -static bool run(llama_context * ctx, const common_params & params) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); +static bool run(llama_context * ctx, const gpt_params & params) { + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - const bool add_bos = llama_vocab_get_add_bos(vocab); + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); - std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); - - if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { - LOG_ERR("%s : failed to eval\n", __func__); + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); return false; } @@ -145,13 +142,16 @@ static bool run(llama_context * ctx, const common_params & params) { int main(int argc, char ** argv) { callback_data cb_data; - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); + print_build_info(); + + std::mt19937 rng(params.seed); llama_backend_init(); llama_numa_init(params.numa); @@ -163,21 +163,18 @@ int main(int argc, char ** argv) { params.warmup = false; // init - common_init_result llama_init = common_init_from_params(params); - - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); - + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == nullptr || ctx == nullptr) { - LOG_ERR("%s : failed to init\n", __func__); + fprintf(stderr, "%s : failed to init\n", __func__); return 1; } // print system information { - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } bool OK = run(ctx, params); @@ -185,8 +182,10 @@ int main(int argc, char ** argv) { return 1; } - LOG("\n"); - llama_perf_context_print(ctx); + llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); llama_backend_free(); diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt index 310455787..1cef6e716 100644 --- a/examples/export-lora/CMakeLists.txt +++ 
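ggml_print_tensor keeps its dumps readable by printing only the first and last n entries of each dimension, with an ellipsis in between, and a running sum of the values it actually printed. A reduced sketch of that elision for a single row of floats (function name and data are illustrative):

```cpp
#include <cstdio>
#include <vector>

// Print at most the first n and last n values, eliding the middle, and report the
// running sum of the printed values (the elided middle is skipped, as in the original).
static void print_row(const std::vector<float> & v, int64_t n) {
    double sum = 0.0;
    printf(" [");
    for (int64_t i0 = 0; i0 < (int64_t) v.size(); i0++) {
        if (i0 == n && (int64_t) v.size() > 2 * n) {
            printf("..., ");
            i0 = (int64_t) v.size() - n; // jump to the tail
        }
        printf("%12.4f", v[i0]);
        sum += v[i0];
        if (i0 < (int64_t) v.size() - 1) {
            printf(", ");
        }
    }
    printf("]\n sum = %f\n", sum);
}

int main() {
    std::vector<float> row(10);
    for (size_t i = 0; i < row.size(); i++) {
        row[i] = (float) i;
    }
    print_row(row, 3); // prints 0..2, "...,", then 7..9
    return 0;
}
```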
b/examples/export-lora/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 7dce99c9a..6d51f4b24 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -17,17 +17,9 @@ For example: ```bash ./bin/llama-export-lora \ - -m open-llama-3b-v2.gguf \ - -o open-llama-3b-v2-english2tokipona-chat.gguf \ - --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf + -m open-llama-3b-v2-q8_0.gguf \ + -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ + --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin ``` -Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: - -```bash -./bin/llama-export-lora \ - -m your_base_model.gguf \ - -o your_merged_model.gguf \ - --lora-scaled lora_task_A.gguf 0.5 \ - --lora-scaled lora_task_B.gguf 0.5 -``` +Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters. diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 91238e4be..124ee167d 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -1,23 +1,15 @@ +#include "common.h" #include "ggml.h" #include "ggml-alloc.h" -#include "gguf.h" - -#include "arg.h" -#include "common.h" #include #include #include +#include #include static bool g_verbose = false; -struct tensor_transformation { - struct ggml_tensor * in; - struct ggml_tensor * out; - bool is_copy; -}; - static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ int id = gguf_find_key(ctx_gguf, key.c_str()); return id < 0 ? 
"" : std::string(gguf_get_val_str(ctx_gguf, id)); @@ -58,6 +50,20 @@ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context return ctx_gguf; } +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; + } + s = std::move(result); +} + struct file_input { struct ggml_context * ctx_meta = nullptr; struct gguf_context * ctx_gguf = nullptr; @@ -129,7 +135,7 @@ struct lora_merge_ctx { lora_merge_ctx( std::string & base_fname, - std::vector & lora_files, + std::vector> & lora_files, std::string & outfile, int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { fout.exceptions(std::ofstream::failbit); // fail fast on write errors @@ -138,9 +144,9 @@ struct lora_merge_ctx { throw std::runtime_error("split model is not yet supported"); } - for (auto & lora_inp : lora_files) { - auto fname = lora_inp.path; - auto scale = lora_inp.scale; + for (auto lora_inp : lora_files) { + auto fname = std::get<0>(lora_inp); + auto scale = std::get<1>(lora_inp); std::unique_ptr adapter(new file_input(fname, scale)); check_metadata_lora(adapter.get()); adapters.push_back(std::move(adapter)); @@ -205,8 +211,8 @@ struct lora_merge_ctx { } } - // mapping base tensor to out tensor (same shape with base, but different type) - std::vector trans; + // if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile + std::vector> base_tensors; for (auto & it : base_model.tensors) { bool t_a = true; bool t_b = true; @@ -215,30 +221,22 @@ struct lora_merge_ctx { t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b"); } auto base_tensor = it.second; + struct ggml_tensor * out_tensor; if (!t_a && !t_b) { // only copy - struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); - ggml_set_name(cpy_tensor, base_tensor->name); - trans.push_back({ - cpy_tensor, - cpy_tensor, - true, - }); - gguf_add_tensor(ctx_out, cpy_tensor); + out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); + ggml_set_name(out_tensor, base_tensor->name); + base_tensors.push_back(std::make_pair(out_tensor, false)); } else if (t_a && t_b) { // need merging - struct ggml_tensor * out_tensor = ggml_new_tensor( - ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); + out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); + out_tensor->type = get_out_tensor_type(base_tensor); ggml_set_name(out_tensor, base_tensor->name); - trans.push_back({ - base_tensor, - out_tensor, - false, - }); - gguf_add_tensor(ctx_out, out_tensor); + base_tensors.push_back(std::make_pair(out_tensor, true)); } else { throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); } + gguf_add_tensor(ctx_out, out_tensor); } // placeholder for the meta data @@ -249,12 +247,12 @@ struct lora_merge_ctx { // process base model tensors size_t n_merged = 0; - for (auto & it : trans) { - if (!it.is_copy) { - merge_tensor(it.in, it.out); + for (auto & it : base_tensors) { + if (it.second) { + merge_tensor(it.first); n_merged++; } else { - copy_tensor(it.in); + copy_tensor(it.first); } } @@ -266,8 +264,8 @@ struct lora_merge_ctx { fout.write((const char *)data.data(), data.size()); } - printf("%s : merged %zu 
tensors with lora adapters\n", __func__, n_merged); - printf("%s : wrote %zu tensors to output file\n", __func__, trans.size()); + printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); + printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size()); } void copy_tensor(struct ggml_tensor * base) { @@ -278,7 +276,7 @@ struct lora_merge_ctx { zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); } - void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) { + void merge_tensor(struct ggml_tensor * base) { std::string name_base(base->name); std::string name_lora_a = name_base + ".lora_a"; std::string name_lora_b = name_base + ".lora_b"; @@ -289,41 +287,25 @@ struct lora_merge_ctx { std::vector inp_a(adapters.size()); std::vector inp_b(adapters.size()); struct ggml_init_params params { - /*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2), + /*.mem_size =*/ ggml_tensor_overhead()*(1+adapters.size()*2), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; struct ggml_context * ctx = ggml_init(params); // alloc tensors - struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne); + struct ggml_tensor * inp = ggml_dup_tensor(ctx, base); for (size_t i = 0; i < adapters.size(); ++i) { auto t_a = adapters[i]->get_tensor(name_lora_a); auto t_b = adapters[i]->get_tensor(name_lora_b); - // TODO: add support for quantized lora - if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) { - throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32"); - } inp_a[i] = ggml_dup_tensor(ctx, t_a); inp_b[i] = ggml_dup_tensor(ctx, t_b); } ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - // load base tensor to backend buffer + // load data to backend buffer base_model.read_tensor_data(name_base, read_buf); - if (base->type != GGML_TYPE_F32) { - // optionally dequantize it - printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); - auto nels = ggml_nelements(inp_base); - const auto * qtype = ggml_get_type_traits(base->type); - std::vector dequant_buf(nels * sizeof(float)); - qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels); - ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); - } else { - ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); - } - - // load lora tensors to backend buffer + ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp)); for (size_t i = 0; i < adapters.size(); ++i) { adapters[i]->read_tensor_data(name_lora_a, read_buf); ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i])); @@ -343,31 +325,20 @@ struct lora_merge_ctx { }; struct ggml_context * ctx0 = ggml_init(params0); gf = ggml_new_graph(ctx0); - struct ggml_tensor * cur = inp_base; + struct ggml_tensor * cur = inp; for (size_t i = 0; i < adapters.size(); ++i) { - struct ggml_tensor * delta; - bool is_tok_embd = string_starts_with(name_base, "token_embd"); - if (is_tok_embd) { - printf("%s : detected token embeddings tensor\n", __func__); - delta = ggml_mul_mat(ctx0, - ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32), - ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)); - } else { - delta = ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))), - ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32)); - } + struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i])); + struct ggml_tensor * 
delta = ggml_mul_mat(ctx0, a_T, inp_b[i]); // scale const float alpha = adapters[i]->alpha; const float rank = (float) inp_b[i]->ne[0]; const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale; delta = ggml_scale(ctx0, delta, scale); - cur = ggml_add(ctx0, delta, cur); - printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); + cur = ggml_add(ctx0, cur, delta); + printf("%s : + merging from adapter[%ld]\n", __func__, i); printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); } - cur = ggml_cast(ctx0, cur, out->type); - printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type)); + cur = ggml_cast(ctx0, cur, get_out_tensor_type(base)); ggml_build_forward_expand(gf, cur); ggml_free(ctx0); } @@ -381,7 +352,7 @@ struct lora_merge_ctx { // write data to output file { - auto * result = ggml_graph_node(gf, -1); + auto result = gf->nodes[gf->n_nodes - 1]; size_t len = ggml_nbytes(result); if (read_buf.size() < len) { read_buf.resize(len); @@ -403,7 +374,9 @@ struct lora_merge_ctx { } }; -static void print_usage(int, char ** argv) { +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + printf("\nexample usage:\n"); printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); printf("\nNOTE: output model is F16\n"); @@ -411,15 +384,16 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } - g_verbose = (params.verbosity > 1); + g_verbose = (params.verbosity == 1); try { - lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads); + lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads); ctx.run_merge(); } catch (const std::exception & err) { fprintf(stderr, "%s\n", err.what()); diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt new file mode 100644 index 000000000..64afe6ddc --- /dev/null +++ b/examples/finetune/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-finetune) +add_executable(${TARGET} finetune.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/finetune/README.md b/examples/finetune/README.md new file mode 100644 index 000000000..1c27df053 --- /dev/null +++ b/examples/finetune/README.md @@ -0,0 +1,90 @@ +# finetune + +Basic usage instructions: + +```bash +# get training data +wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt + +# finetune LORA adapter +./bin/llama-finetune \ + --model-base open-llama-3b-v2-q8_0.gguf \ + --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \ + --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \ + --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \ + --train-data "shakespeare.txt" \ + --save-every 10 \ + --threads 6 --adam-iter 30 --batch 4 --ctx 64 \ + --use-checkpointing + +# predict +./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin +``` + 
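In the export-lora merge loop above, each adapter's contribution is scaled by an effective factor derived from its alpha and rank: user scale times alpha/rank when alpha is set, otherwise just the user scale. A small numeric sketch of that scale and of folding the deltas into a base weight (the per-adapter deltas here are single numbers standing in for the full low-rank product, purely for illustration):

```cpp
#include <cstdio>
#include <vector>

struct adapter_info {
    float user_scale; // value passed via --lora-scaled (1.0f for plain --lora)
    float alpha;      // lora_alpha stored in the adapter (0 if absent)
    float rank;       // lora rank r
};

// Effective scale for one adapter, mirroring:
//   scale = alpha ? user_scale * alpha / rank : user_scale
static float effective_scale(const adapter_info & a) {
    return a.alpha ? a.user_scale * a.alpha / a.rank : a.user_scale;
}

int main() {
    float w = 0.25f; // one illustrative base weight value
    const std::vector<adapter_info> adapters = { { 0.4f, 32.0f, 16.0f }, { 0.8f, 0.0f, 8.0f } };
    const std::vector<float> delta           = { 0.10f, -0.05f }; // stand-ins for B*A products

    for (size_t i = 0; i < adapters.size(); ++i) {
        const float scale = effective_scale(adapters[i]);
        w += scale * delta[i];
        printf("adapter %zu: scale = %.3f, merged weight = %.4f\n", i, scale, w);
    }
    return 0;
}
```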
+**Only llama-based models are supported!** The output files will be saved every N iterations (configurable with `--save-every N`). +The pattern 'ITERATION' in the output filenames will be replaced with the iteration number, and with 'LATEST' for the most recent output. +So in the above example, after 10 iterations these files will be written: +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf +- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin +- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin + +After 10 more iterations: +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf +- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin +- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin + +Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter. + +llama.cpp-compatible LORA adapters will be saved with the filename specified by `--lora-out FN`. +These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above. + +In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together. + +For example, if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this: + +```bash +./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \ + --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \ + --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin +``` + +You can change how strongly each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`. + +For example, to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one: + +```bash +./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \ + --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \ + --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \ + --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin +``` + +The scale values don't need to add up to one, and you can also use values greater than 1 to further increase an adapter's influence. Making the values too large, however, will sometimes result in worse output, so experiment to find values that work well. + +Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime. +If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`. + +The default LORA rank can be specified with `--lora-r N`. +The LORA rank can be configured for each model tensor type separately with these command line options: + +```bash + --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha.
(default 4) + --rank-att-norm N LORA rank for attention norm tensor (default 1) + --rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1) + --rank-out-norm N LORA rank for output norm tensor (default 1) + --rank-tok-embd N LORA rank for token embeddings tensor (default 4) + --rank-out N LORA rank for output tensor (default 4) + --rank-wq N LORA rank for wq tensor (default 4) + --rank-wk N LORA rank for wk tensor (default 4) + --rank-wv N LORA rank for wv tensor (default 4) + --rank-wo N LORA rank for wo tensor (default 4) + --rank-ffn_gate N LORA rank for ffn_gate tensor (default 4) + --rank-ffn_down N LORA rank for ffn_down tensor (default 4) + --rank-ffn_up N LORA rank for ffn_up tensor (default 4) +``` + +The LORA rank of 'norm' tensors should always be 1. + +To see all available options use `llama-finetune --help`. diff --git a/examples/finetune/convert_finetune_checkpoint_to_gguf.py b/examples/finetune/convert_finetune_checkpoint_to_gguf.py new file mode 100644 index 000000000..1b79d6995 --- /dev/null +++ b/examples/finetune/convert_finetune_checkpoint_to_gguf.py @@ -0,0 +1,487 @@ +#!/usr/bin/env python3 +# finetune checkpoint --> gguf conversion + +import argparse +import gguf +import struct +import numpy as np +from pathlib import Path + +# gguf constants +LLM_KV_OPTIMIZER_TYPE = "optimizer.type" +LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" +LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs" +LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version" +LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count" +LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count" +LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count" +LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized" +LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss" +LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss" +LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count" +LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count" +LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end" +LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count" + +LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments" +LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments" +LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values" + +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction" +LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" + +LLM_KV_TRAINING_TYPE_TRAIN_MODEL = 
"train_model" +LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora" +LLM_KV_TRAINING_TYPE = "training.type" +LLM_KV_TRAINING_FILE_VERSION = "training.file_version" +LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" +LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" +LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" + +LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd" +LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm" +LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output" +LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm" +LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q" +LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k" +LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v" +LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output" +LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm" +LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate" +LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down" +LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up" + +class Tensor: + def __init__(self, dtype='f', ne=None): + if ne is None: + ne = [] + self.dtype = dtype + self.ne = ne + self.nbytes = 0 + if self.dtype == 'f': + if len(self.ne) == 0: + self.nbytes = 0 + else: + self.nbytes = int(np.prod(self.ne)) * 4 + else: + raise ValueError(f"Unhandled data type '{self.dtype}'") + + def load(self, data, offset): + nd = struct.unpack(' 0 else []) + + self.lbfgs_x = Tensor('f', [self.nx]) + self.lbfgs_xp = Tensor('f', [self.nx]) + self.lbfgs_g = Tensor('f', [self.nx]) + self.lbfgs_gp = Tensor('f', [self.nx]) + self.lbfgs_d = Tensor('f', [self.nx]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) + self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) + self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) + + # forgot to save type in version 1: + # guess self.type from number of remaining bytes + size_type_0 = 12 + sum([t.max_storage_size() for t in + [self.adam_m, self.adam_v] + +([self.adam_pf] if (self.past > 0) else [])]) + size_type_1 = 24 + sum([t.max_storage_size() for t in + [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, + self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, + self.lbfgs_lmal, self.lbfgs_lmys, + self.lbfgs_lms, self.lbfgs_lmy] + +([self.lbfgs_pf] if (self.past > 0) else [])]) + # due to alignment padding the size might not by exact + # but the difference in size for both types is significant, + # so we can just use whichever is closest + remaining = len(data) - offset + if abs(remaining - size_type_0) < abs(remaining - size_type_1): + self.type = 0 + else: + self.type = 1 + + if self.type == 0: + offset = self.adam_m.load(data, offset) + offset = self.adam_v.load(data, offset) + offset = self.adam_pf.load(data,offset) + + self.adam_fx_best = struct.unpack(' 0: + self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES) + + elif self.type == 1: + gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j) + 
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement) + + self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS) + self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS) + self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS) + self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS) + self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION) + if self.past > 0: + self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES) + self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA) + self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS) + self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S) + self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y) + else: + raise ValueError('Unknown optimizer type') + +class LoraParams: + def __init__(self): + pass + + def load(self, data, offset): + self.n_rank_attention_norm = struct.unpack(' +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +struct my_llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; + uint32_t n_embd = 4096; + uint32_t n_ff = 11008; + uint32_t n_head = 32; + uint32_t n_head_kv = 32; + uint32_t n_layer = 32; + + // float f_norm_eps = 1e-5f; // falcon + float f_norm_rms_eps = 1e-5f; // llama + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; + + uint32_t n_gqa() const { + return n_head/n_head_kv; + } + + uint32_t n_embd_head() const { + return n_embd/n_head; + } + + uint32_t n_embd_gqa() const { + return n_embd/n_gqa(); + } + + bool operator!=(const my_llama_hparams& other) const { + return memcmp(this, &other, sizeof(other)); + } +}; + +struct my_llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 +}; + +struct my_llama_model { + struct my_llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; +}; + +struct my_llama_lora_hparams { + uint32_t lora_r = 1; + uint32_t lora_alpha = 1; + uint32_t n_rank_attention_norm = 1; + uint32_t n_rank_wq = 4; + uint32_t n_rank_wk = 4; + uint32_t n_rank_wv = 4; + uint32_t n_rank_wo = 4; + uint32_t n_rank_ffn_norm = 1; + uint32_t n_rank_ffn_gate = 4; + uint32_t n_rank_ffn_down = 4; + uint32_t n_rank_ffn_up = 4; + uint32_t n_rank_tok_embeddings = 4; + uint32_t n_rank_norm = 1; + uint32_t n_rank_output = 4; + + bool operator!=(const my_llama_lora_hparams& other) const { + return memcmp(this, &other, sizeof(other)); + } +}; + +struct my_llama_lora_layer { + // normalization + struct ggml_tensor * attention_norm_a; + struct ggml_tensor * attention_norm_b; + + // attention + struct ggml_tensor * wq_a; + struct ggml_tensor * wq_b; + struct ggml_tensor * wk_a; + struct ggml_tensor * wk_b; + struct 
ggml_tensor * wv_a; + struct ggml_tensor * wv_b; + struct ggml_tensor * wo_a; + struct ggml_tensor * wo_b; + + // normalization + struct ggml_tensor * ffn_norm_a; + struct ggml_tensor * ffn_norm_b; + + // ff + struct ggml_tensor * ffn_gate_a; + struct ggml_tensor * ffn_gate_b; + struct ggml_tensor * ffn_down_a; + struct ggml_tensor * ffn_down_b; + struct ggml_tensor * ffn_up_a; + struct ggml_tensor * ffn_up_b; +}; + +struct my_llama_lora { + struct ggml_context * ctx = NULL; + ggml_backend_buffer_t data; + + my_llama_lora_hparams hparams; + + struct ggml_tensor * tok_embeddings_a; + struct ggml_tensor * tok_embeddings_b; + + struct ggml_tensor * norm_a; + struct ggml_tensor * norm_b; + struct ggml_tensor * output_a; + struct ggml_tensor * output_b; + + std::vector layers; +}; + +// gguf constants +static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; +static const char * LLM_KV_TRAINING_TYPE = "training.type"; + +static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"; +static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"; +static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"; + +// gguf constants (sync with gguf.py) + +static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; +static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + +static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; +static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; +static const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; +static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; +static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; +static const char * LLM_KV_ATTENTION_HEAD_COUNT_KV = "%s.attention.head_count_kv"; +static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; +static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; +static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp +static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + +static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; +static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; +static const char * LLM_TENSOR_OUTPUT = "output"; +static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; +static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; +static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; +static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; +static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; +static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; +static const char * LLM_TENSOR_FFN_GATE = 
"blk.%d.ffn_gate"; +static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; +static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; + +static void print_params(struct my_llama_hparams * params) { + printf("%s: n_vocab : %u\n", __func__, params->n_vocab); + printf("%s: n_ctx : %u\n", __func__, params->n_ctx); + printf("%s: n_embd : %u\n", __func__, params->n_embd); + printf("%s: n_ff : %u\n", __func__, params->n_ff); + printf("%s: n_head : %u\n", __func__, params->n_head); + printf("%s: n_head_kv : %u\n", __func__, params->n_head_kv); + printf("%s: n_layer : %u\n", __func__, params->n_layer); + printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); + printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); + printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); +} + +static void print_lora_params(struct my_llama_lora_hparams * params) { + printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm); + printf("%s: n_rank_wq : %u\n", __func__, params->n_rank_wq); + printf("%s: n_rank_wk : %u\n", __func__, params->n_rank_wk); + printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); + printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); + printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); + printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate); + printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down); + printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up); + printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); + printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); + printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); +} + +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + die_fmt("key not found in model: %s", skey.c_str()); \ + } \ +} + +static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_hparams * hparams, const char * expected_arch) { + std::string arch; + + GGUF_GET_KEY(ctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); + if (expected_arch != NULL) { + if (arch != expected_arch) { + printf("%s: arch=%s expected_arch=%s\n", __func__, arch.c_str(), expected_arch); + } + GGML_ASSERT(arch == expected_arch); + } + + std::vector keybuf; + keybuf.resize(512); + auto kv = [&arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch.c_str()); + return keybuf.data(); + }; + + GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + + // n_head_kv is optional, default to n_head + hparams->n_head_kv = hparams->n_head; + GGUF_GET_KEY(ctx, hparams->n_head_kv, gguf_get_val_u32, 
GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + + float rope_freq_scale = 1.0f; + GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + if (rope_freq_scale != 1.0f) { + hparams->rope_freq_scale = 1.0f / rope_freq_scale; + } +} + +static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) { + auto & hparams = model->hparams; + + std::vector tn_buf; + tn_buf.resize(GGML_MAX_NAME); + auto tn = [&tn_buf](const char * key) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); + return tn_buf.data(); + }; + auto tni = [&tn_buf](const char * key, int bid) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); + return tn_buf.data(); + }; + + + // get parameters directly from gguf file + { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ NULL, + }; + struct gguf_context * mctx = gguf_init_from_file(fn_model, params); + + load_model_hparams_gguf(mctx, &hparams, "llama"); + + gguf_free(mctx); + } + hparams.n_vocab = llama_n_vocab(input); + hparams.n_ctx = n_ctx; + + // get tensors from llama_model (possibly mmapped) + model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); + model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); + model->output = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT)); + + assert_shape_2d(model->tok_embeddings, hparams.n_embd, hparams.n_vocab); + assert_shape_1d(model->norm, hparams.n_embd); + assert_shape_2d(model->output, hparams.n_embd, hparams.n_vocab); + + model->layers.resize(hparams.n_layer); + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i)); + layer.wq = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_Q, i)); + layer.wk = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_K, i)); + layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); + layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); + layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); + layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); + layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); + layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); + + assert_shape_1d(layer.attention_norm, hparams.n_embd); + assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); + assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd_gqa()); + assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa()); + assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); + assert_shape_1d(layer.ffn_norm, hparams.n_embd); + assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd); + assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff); + } +} + +static void set_param_lora(struct my_llama_lora * lora) { + const uint32_t n_layer = lora->layers.size(); + + struct ggml_context* ctx = lora->ctx; + + ggml_set_param(ctx, 
lora->tok_embeddings_a); + ggml_set_param(ctx, lora->tok_embeddings_b); + ggml_set_param(ctx, lora->norm_a); + ggml_set_param(ctx, lora->norm_b); + ggml_set_param(ctx, lora->output_a); + ggml_set_param(ctx, lora->output_b); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + + ggml_set_param(ctx, layer.attention_norm_a); + ggml_set_param(ctx, layer.attention_norm_b); + ggml_set_param(ctx, layer.wq_a); + ggml_set_param(ctx, layer.wq_b); + ggml_set_param(ctx, layer.wk_a); + ggml_set_param(ctx, layer.wk_b); + ggml_set_param(ctx, layer.wv_a); + ggml_set_param(ctx, layer.wv_b); + ggml_set_param(ctx, layer.wo_a); + ggml_set_param(ctx, layer.wo_b); + ggml_set_param(ctx, layer.ffn_norm_a); + ggml_set_param(ctx, layer.ffn_norm_b); + ggml_set_param(ctx, layer.ffn_gate_a); + ggml_set_param(ctx, layer.ffn_gate_b); + ggml_set_param(ctx, layer.ffn_down_a); + ggml_set_param(ctx, layer.ffn_down_b); + ggml_set_param(ctx, layer.ffn_up_a); + ggml_set_param(ctx, layer.ffn_up_b); + } +} + +static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { + const auto & lparams = lora->hparams; + + const uint32_t n_embd = model->hparams.n_embd; + const uint32_t n_embd_gqa = model->hparams.n_embd_gqa(); + const uint32_t n_layer = model->hparams.n_layer; + const uint32_t n_vocab = model->hparams.n_vocab; + const uint32_t n_ff = model->hparams.n_ff; + + std::vector tn_buf; + tn_buf.resize(GGML_MAX_NAME); + auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); + return tn_buf.data(); + }; + auto tni = [&tn_buf](const char * key, const char * suffix, int bid) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); + return tn_buf.data(); + }; + + // context for lora tensors without their data + struct ggml_init_params ctx_lora_params; + ctx_lora_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18); + ctx_lora_params.mem_buffer = NULL; + ctx_lora_params.no_alloc = true; + + struct ggml_context * ctx = ggml_init(ctx_lora_params); + lora->ctx = ctx; + + lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd); + lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab); + lora->norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd); + lora->norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, 1); + lora->output_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd); + lora->output_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab); + + ggml_set_name(lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_a")); + ggml_set_name(lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_b")); + ggml_set_name(lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_a")); + ggml_set_name(lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_b")); + ggml_set_name(lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.lora_a")); + ggml_set_name(lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.lora_b")); + + lora->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + + layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd); + layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 
lparams.n_rank_attention_norm, 1); + + layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); + layer.wq_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); + layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd); + layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd_gqa); + layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd); + layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd_gqa); + layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); + layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); + + layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); + layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); + + layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd); + layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff); + layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff); + layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd); + layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd); + layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff); + + ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); + ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i)); + ggml_set_name(layer.wq_a, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_a", i)); + ggml_set_name(layer.wq_b, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_b", i)); + ggml_set_name(layer.wk_a, tni(LLM_TENSOR_ATTN_K, ".weight.lora_a", i)); + ggml_set_name(layer.wk_b, tni(LLM_TENSOR_ATTN_K, ".weight.lora_b", i)); + ggml_set_name(layer.wv_a, tni(LLM_TENSOR_ATTN_V, ".weight.lora_a", i)); + ggml_set_name(layer.wv_b, tni(LLM_TENSOR_ATTN_V, ".weight.lora_b", i)); + ggml_set_name(layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_a", i)); + ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); + } + + set_param_lora(lora); + + // allocate data for lora tensors + lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); +} + +static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { + const uint32_t n_layer = lora->layers.size(); + + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); + + randomize_tensor_normal(lora->tok_embeddings_a, rnd); + ggml_set_zero(lora->tok_embeddings_b); + randomize_tensor_normal(lora->norm_a, rnd); + ggml_set_zero(lora->norm_b); + randomize_tensor_normal(lora->output_a, rnd); + ggml_set_zero(lora->output_b); 
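+    // same pattern for the per-layer tensors below: each lora_a factor is drawn from the normal distribution
+    // while the matching lora_b factor starts at zero, so a freshly initialized adapter leaves the model output unchanged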
+ + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + randomize_tensor_normal(layer.attention_norm_a, rnd); + ggml_set_zero(layer.attention_norm_b); + + randomize_tensor_normal(layer.wq_a, rnd); + ggml_set_zero(layer.wq_b); + randomize_tensor_normal(layer.wk_a, rnd); + ggml_set_zero(layer.wk_b); + randomize_tensor_normal(layer.wv_a, rnd); + ggml_set_zero(layer.wv_b); + randomize_tensor_normal(layer.wo_a, rnd); + ggml_set_zero(layer.wo_b); + + randomize_tensor_normal(layer.ffn_norm_a, rnd); + ggml_set_zero(layer.ffn_norm_b); + + randomize_tensor_normal(layer.ffn_gate_a, rnd); + ggml_set_zero(layer.ffn_gate_b); + randomize_tensor_normal(layer.ffn_down_a, rnd); + ggml_set_zero(layer.ffn_down_b); + randomize_tensor_normal(layer.ffn_up_a, rnd); + ggml_set_zero(layer.ffn_up_b); + } + + free_random_normal_distribution(rnd); +} + +static struct ggml_tensor * llama_build_lora_finetune_graphs( + struct my_llama_model * model, + struct my_llama_lora * lora, + ggml_gallocr_t alloc, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * logits, + struct ggml_tensor * tokens_input, + struct ggml_tensor * targets, + const int n_tokens, + const int n_batch, + const bool enable_flash_attn, + const bool enable_checkpointing, + const bool measure_only) { + + ggml_set_scratch(ctx, { 0, 0, nullptr, }); + const int n_past = 0; + const int N = n_tokens; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_head_kv = hparams.n_head_kv; + const int n_ff = hparams.n_ff; + const int n_rot = hparams.n_embd_head(); + const int n_embd_head = hparams.n_embd_head(); + const int n_embd_gqa = hparams.n_embd_gqa(); + + const float rms_norm_eps = hparams.f_norm_rms_eps; + const float rope_freq_base = hparams.rope_freq_base; + const float rope_freq_scale = hparams.rope_freq_scale; + + GGML_ASSERT((size_t) n_layer == lora->layers.size()); + + auto set_name = [](struct ggml_tensor * t, const char * n) { + ggml_set_name(t, n); + if (t->grad) { + ggml_format_name(t->grad, "%s->grad", n); + } + }; + + // KQ_pos - contains the positions + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); + ggml_set_input(KQ_pos); + + // rope has so many parameters that we make a custom function for it + auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] + (struct ggml_tensor * t) -> struct ggml_tensor * { + // not capturing these, to silence warnings + const int rope_mode = 0; + + return ggml_rope_ext(ctx, + t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, + rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f + ); + }; + + set_name(tokens_input, "tokens_input"); + set_name(targets, "targets"); + + GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + + auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { + if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) { + return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); + } else if (a->type == GGML_TYPE_F32) { + return ggml_add(ctx, a, b); + } else { + die_fmt("%s: Finetuning on tensors with type '%s' is not yet supported.\n", + __func__, ggml_type_name(a->type)); + } + }; + + struct ggml_tensor * tok_embeddings = add_to_f32(ctx, model->tok_embeddings, ggml_mul_mat(ctx, 
lora->tok_embeddings_a, lora->tok_embeddings_b)); + struct ggml_tensor * norm = add_to_f32(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b)); + struct ggml_tensor * output = add_to_f32(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b)); + + struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch); + struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch); + + struct ggml_tensor * cur = t01; + + std::vector checkpoints; + if (enable_checkpointing) { + checkpoints.push_back(tokens_input); + checkpoints.push_back(targets); + checkpoints.push_back(t00); + checkpoints.push_back(t01); + } + + const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head); + + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + struct my_llama_lora_layer & llayer = lora->layers[il]; + + struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); + struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); + struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); + struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); + struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); + struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); + struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b)); + struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b)); + struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b)); + + struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); + struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t05 = ggml_mul_mat (ctx, wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); + struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd_gqa, N*n_batch); + struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd_head, n_head_kv, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd_head, n_head_kv, N, n_batch); + struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd_head, n_head_kv, N, n_batch); + + struct ggml_tensor * t11; + if (ggml_is_quantized(wv->type)) { + struct ggml_tensor * t11_1 = ggml_mul_mat (ctx, wv, t04); set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd_gqa, N*n_batch); + struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1); set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, 
n_embd_gqa); + t11 = ggml_cont (ctx, t11_2); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); + } else { + t11 = ggml_mul_mat (ctx, t04, wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); + } + + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd_head, n_head_kv); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd_head, n_head_kv); + struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd_head, N, n_head, n_batch); + struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd_head, N, n_head_kv, n_batch); + struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch); + struct ggml_tensor * t16; + if (enable_flash_attn) { + GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported"); + //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); + } else { + struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); + struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); + struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); + struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); + t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); + } + struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); + struct ggml_tensor * t20 = ggml_mul_mat (ctx, wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); + struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); + struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); + struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); + cur 
= t30; + if (enable_checkpointing) { + checkpoints.push_back(cur); + } + } + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = ggml_repeat (ctx, norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = ggml_mul_mat (ctx, output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); + + if (enable_checkpointing) { + checkpoints.push_back(t31); + checkpoints.push_back(t32); + checkpoints.push_back(t33); + checkpoints.push_back(t34); + checkpoints.push_back(t35); + checkpoints.push_back(t36); + } + + ggml_build_forward_expand(gf, t36); + + if (enable_checkpointing) { + ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); + } else { + ggml_graph_cpy(gf, gb); + ggml_build_backward_expand(ctx, gf, gb, true); + } + + GGML_ASSERT(alloc != NULL); + + // make sure some tensors are not reallocated by inserting new temporary nodes depending on them + int n_leafs_before = gb->n_leafs; + int n_nodes_before = gb->n_nodes; + + // output tensors + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f)); + // input gradient + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); + GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); + ggml_set_input(t36->grad); + // KQ_pos + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); + + // make sure base model tensors data cannot be used in viewable operations + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f)); + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f)); + } + + // allocating checkpoints in one block to reduce memory fragmentation + // note: they will be freed in reverse order + for (unsigned int i = 0; i < checkpoints.size(); ++i) { + if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { + ggml_set_input(checkpoints[i]); + } + } + + if (measure_only) { + ggml_gallocr_reserve(alloc, gb); + } else { + ggml_gallocr_alloc_graph(alloc, gb); + + // set KQ_pos + { + int * data = (int *) KQ_pos->data; 
+ for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + } + + // remove the additional nodes and leafs + for (int i = n_leafs_before; i < gb->n_leafs; ++i) { + gb->leafs[i] = NULL; + } + for (int i = n_nodes_before; i < gb->n_nodes; ++i) { + gb->nodes[i] = NULL; + } + gb->n_leafs = n_leafs_before; + gb->n_nodes = n_nodes_before; + + *logits = t35; + return t36; +} + +static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + + std::string arch; + + std::vector keybuf; + keybuf.resize(512); + + GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); + GGML_ASSERT(arch == "llama"); + + uint32_t ftype_u; + GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); + GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); + + struct my_llama_hparams hparams; + load_model_hparams_gguf(fctx, &hparams, arch.c_str()); + + // parameters that define tensor shapes must match + GGML_ASSERT(hparams.n_embd == model->hparams.n_embd); + GGML_ASSERT(hparams.n_ff == model->hparams.n_ff); + GGML_ASSERT(hparams.n_head == model->hparams.n_head); + GGML_ASSERT(hparams.n_head_kv == model->hparams.n_head_kv); + GGML_ASSERT(hparams.n_layer == model->hparams.n_layer); + + GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_output, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_attention_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wq, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_Q); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wk, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_K); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); + + init_lora(model, lora); + + copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a)); + copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b)); + copy_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a)); + copy_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b)); + copy_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a)); + copy_tensor_by_name(lora->output_b, f_ggml_ctx, 
ggml_get_name(lora->output_b)); + + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a)); + copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b)); + copy_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a)); + copy_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b)); + copy_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a)); + copy_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b)); + copy_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a)); + copy_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b)); + copy_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a)); + copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); + copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); + copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); + copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a)); + copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b)); + copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a)); + copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b)); + copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a)); + copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b)); + } +} + +static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) { + const char * arch = "llama"; + enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; + + std::vector keybuf; + keybuf.resize(512); + auto kv = [arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch); + return keybuf.data(); + }; + + gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); + gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); + + gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx); + gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd); + gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff); + gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head); + gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV), model->hparams.n_head_kv); + gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer); + gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_embd_head()); + gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), model->hparams.rope_freq_scale); + + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, lora->hparams.n_rank_tok_embeddings); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, lora->hparams.n_rank_norm); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT, lora->hparams.n_rank_output); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, lora->hparams.n_rank_attention_norm); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_Q, lora->hparams.n_rank_wq); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_K, lora->hparams.n_rank_wk); + gguf_set_val_u32(fctx, 
LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up); + + gguf_add_tensor(fctx, lora->tok_embeddings_a); + gguf_add_tensor(fctx, lora->tok_embeddings_b); + gguf_add_tensor(fctx, lora->norm_a); + gguf_add_tensor(fctx, lora->norm_b); + gguf_add_tensor(fctx, lora->output_a); + gguf_add_tensor(fctx, lora->output_b); + + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + + gguf_add_tensor(fctx, layer.attention_norm_a); + gguf_add_tensor(fctx, layer.attention_norm_b); + gguf_add_tensor(fctx, layer.wq_a); + gguf_add_tensor(fctx, layer.wq_b); + gguf_add_tensor(fctx, layer.wk_a); + gguf_add_tensor(fctx, layer.wk_b); + gguf_add_tensor(fctx, layer.wv_a); + gguf_add_tensor(fctx, layer.wv_b); + gguf_add_tensor(fctx, layer.wo_a); + gguf_add_tensor(fctx, layer.wo_b); + gguf_add_tensor(fctx, layer.ffn_norm_a); + gguf_add_tensor(fctx, layer.ffn_norm_b); + gguf_add_tensor(fctx, layer.ffn_gate_a); + gguf_add_tensor(fctx, layer.ffn_gate_b); + gguf_add_tensor(fctx, layer.ffn_down_a); + gguf_add_tensor(fctx, layer.ffn_down_b); + gguf_add_tensor(fctx, layer.ffn_up_a); + gguf_add_tensor(fctx, layer.ffn_up_b); + } +} + +static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { + std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA; + GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); + GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA); + + load_train_state_gguf(fctx, f_ggml_ctx, train); + load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora); +} + +static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { + gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA); + save_llama_lora_gguf(fctx, model, lora); + save_train_state_gguf(fctx, train); +} + +static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { + struct ggml_context * f_ggml_ctx; + struct gguf_init_params params; + params.no_alloc = false; + params.ctx = &f_ggml_ctx; + struct gguf_context * fctx = gguf_init_from_file(filename, params); + if (fctx == NULL) { + return false; + } + + load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, train); + + gguf_free(fctx); + return true; +} + +static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { + printf("%s: saving to %s\n", __func__, filename); + struct gguf_context * fctx = gguf_init_empty(); + + save_checkpoint_lora_gguf(fctx, model, lora, train); + + // write file + const bool only_meta = false; + gguf_write_to_file(fctx, filename, only_meta); + gguf_free(fctx); +} + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const 
char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != 1) { + die("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + die_fmt("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) { + if (tensor == NULL) { + file->write_u32(0); + file->write_u32(0); + file->write_u32(GGML_TYPE_F32); + file->seek((0-file->tell()) & 31, SEEK_CUR); + return; + } + if (name == NULL) { + name = ggml_get_name(tensor); + } + uint32_t name_len = strlen(name); + uint32_t nd = ggml_n_dims(tensor); + uint32_t ne[4] = { (uint32_t)tensor->ne[0], + (uint32_t)tensor->ne[1], + (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; + file->write_u32(nd); + file->write_u32(name_len); + file->write_u32(tensor->type); + file->write_raw(ne, sizeof(ne[0]) * nd); + file->write_raw(name, name_len); + file->seek((0-file->tell()) & 31, SEEK_CUR); + file->write_raw(tensor->data, ggml_nbytes(tensor)); +} + +static void save_as_llama_lora(const char * filename, struct my_llama_lora * lora) { + printf("%s: saving to %s\n", __func__, filename); + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + + std::vector tn_buf; + tn_buf.resize(GGML_MAX_NAME); + + auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); + return tn_buf.data(); + }; + + auto tni = [&tn_buf](const char * key, int bid, const char * suffix) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); + return tn_buf.data(); + }; + + // write_magic + file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic + file.write_u32(1); // version + // write_hparams + file.write_u32(lora->hparams.lora_r); + file.write_u32(lora->hparams.lora_alpha); + // write tensors + write_tensor(&file, lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraA")); + write_tensor(&file, lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraB")); + write_tensor(&file, lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraA")); + write_tensor(&file, lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraB")); + 
write_tensor(&file, lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.loraA")); + write_tensor(&file, lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.loraB")); + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + write_tensor(&file, layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraA")); + write_tensor(&file, layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraB")); + write_tensor(&file, layer.wq_a, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraA")); + write_tensor(&file, layer.wq_b, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraB")); + write_tensor(&file, layer.wk_a, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraA")); + write_tensor(&file, layer.wk_b, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraB")); + write_tensor(&file, layer.wv_a, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraA")); + write_tensor(&file, layer.wv_b, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraB")); + write_tensor(&file, layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraA")); + write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); + } +} + +struct train_params { + struct train_params_common common; + + const char * fn_model_base; + const char * fn_lora_out; + + bool only_write_lora; + + float f_norm_rms_eps; + float rope_freq_base; + float rope_freq_scale; + + bool custom_f_norm_rms_eps; + bool custom_rope_freq_base; + bool custom_rope_freq_scale; + + int32_t lora_r; + int32_t lora_alpha; + bool custom_lora_alpha; + + uint32_t n_rank_attention_norm; + uint32_t n_rank_wq; + uint32_t n_rank_wk; + uint32_t n_rank_wv; + uint32_t n_rank_wo; + uint32_t n_rank_ffn_norm; + uint32_t n_rank_ffn_gate; + uint32_t n_rank_ffn_down; + uint32_t n_rank_ffn_up; + uint32_t n_rank_tok_embeddings; + uint32_t n_rank_norm; + uint32_t n_rank_output; + + bool custom_n_rank_attention_norm; + bool custom_n_rank_wq; + bool custom_n_rank_wk; + bool custom_n_rank_wv; + bool custom_n_rank_wo; + bool custom_n_rank_ffn_norm; + bool custom_n_rank_ffn_gate; + bool custom_n_rank_ffn_down; + bool custom_n_rank_ffn_up; + bool custom_n_rank_tok_embeddings; + bool custom_n_rank_norm; + bool custom_n_rank_output; +}; + +static struct train_params get_default_train_params() { + struct train_params params; + params.common = get_default_train_params_common(); + params.fn_model_base = ""; + params.fn_lora_out = "ggml-lora-ITERATION-f32.gguf"; + + params.only_write_lora = false; + + params.f_norm_rms_eps = 1e-5f; + params.rope_freq_base = 10000.0f; + params.rope_freq_scale = 1.0f; + + params.custom_f_norm_rms_eps = false; + params.custom_rope_freq_base = false; + params.custom_rope_freq_scale = false; + + params.lora_r = 4; + params.lora_alpha = 4; + params.custom_lora_alpha = false; + + params.n_rank_attention_norm = 1; + params.n_rank_wq = 4; + params.n_rank_wk = 4; + params.n_rank_wv = 4; + params.n_rank_wo = 4; + 
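Editorial aside: the `tn()`/`tni()` lambdas used by `save_as_llama_lora()` above build the legacy tensor names by expanding a printf-style per-layer template and appending a `.weight.loraA` / `.weight.loraB` suffix. The sketch below reproduces that pattern in isolation; the template string `"blk.%d.attn_q"` is assumed for illustration, and the real `LLM_TENSOR_*` constants live elsewhere in the example.

```cpp
// Isolated sketch of the tn()/tni() naming pattern: a per-layer template plus a
// LoRA suffix yields names such as "blk.7.attn_q.weight.loraB".
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<char> buf(256);

    auto tn = [&buf](const char * key, const char * suffix) -> std::string {
        std::snprintf(buf.data(), buf.size(), "%s%s", key, suffix);
        return buf.data();
    };

    auto tni = [&buf, &tn](const char * key, int bid, const char * suffix) -> std::string {
        std::snprintf(buf.data(), buf.size(), key, bid);  // expand the layer index first
        const std::string base = buf.data();
        return tn(base.c_str(), suffix);                  // then append the LoRA suffix
    };

    std::printf("%s\n", tn("output_norm", ".weight.loraA").c_str());
    std::printf("%s\n", tni("blk.%d.attn_q", 7, ".weight.loraB").c_str());  // assumed template
    return 0;
}
```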
params.n_rank_ffn_norm = 1; + params.n_rank_ffn_gate = 4; + params.n_rank_ffn_down = 4; + params.n_rank_ffn_up = 4; + params.n_rank_tok_embeddings = 4; + params.n_rank_norm = 1; + params.n_rank_output = 4; + + params.custom_n_rank_attention_norm = false; + params.custom_n_rank_wq = false; + params.custom_n_rank_wk = false; + params.custom_n_rank_wv = false; + params.custom_n_rank_wo = false; + params.custom_n_rank_ffn_norm = false; + params.custom_n_rank_ffn_gate = false; + params.custom_n_rank_ffn_down = false; + params.custom_n_rank_ffn_up = false; + params.custom_n_rank_tok_embeddings = false; + params.custom_n_rank_norm = false; + params.custom_n_rank_output = false; + + return params; +} + +static void train_print_usage(int argc, char ** argv, const struct train_params * params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + + fprintf(stderr, " --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base); + fprintf(stderr, " --lora-out FNAME path to save llama lora (default '%s')\n", params->fn_lora_out); + fprintf(stderr, " --only-write-lora only save llama lora, don't do any training. use this if you only want to convert a checkpoint to a lora adapter.\n"); + fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); + fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); + fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); + fprintf(stderr, " --lora-alpha N LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha); + fprintf(stderr, " --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default %d)\n", params->lora_r); + fprintf(stderr, " --rank-att-norm N LORA rank for attention norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); + fprintf(stderr, " --rank-ffn-norm N LORA rank for feed-forward norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); + fprintf(stderr, " --rank-out-norm N LORA rank for output norm tensor, overrides default rank. 
Norm tensors should generally have rank 1.\n"); + fprintf(stderr, " --rank-tok-embd N LORA rank for token embeddings tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-out N LORA rank for output tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wq N LORA rank for wq tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n"); + + print_common_train_usage(argc, argv, ¶ms->common); +} + +static bool train_params_parse(int argc, char ** argv, struct train_params * params) { + bool invalid_param = false; + std::string arg; + struct train_params default_params = get_default_train_params(); + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (consume_common_train_arg(argc, argv, &i, ¶ms->common, &invalid_param)) { + if (invalid_param) { + break; + } else if (params->common.print_usage) { + train_print_usage(argc, argv, &default_params); + exit(0); + } + } else if (arg == "--model-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_base = argv[i]; + } else if (arg == "--lora-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_lora_out = argv[i]; + } else if (arg == "--only-write-lora") { + params->only_write_lora = true; + } else if (arg == "--norm-rms-eps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->f_norm_rms_eps = std::stof(argv[i]); + params->custom_f_norm_rms_eps = true; + } else if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->rope_freq_base = std::stof(argv[i]); + params->custom_rope_freq_base = true; + } else if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->rope_freq_scale = std::stof(argv[i]); + params->custom_rope_freq_scale = true; + } else if (arg == "--lora-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lora_alpha = std::stoi(argv[i]); + params->custom_lora_alpha = true; + } else if (arg == "--lora-r") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lora_r = std::stoi(argv[i]); + } else if (arg == "--rank-att-norm") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_attention_norm = std::stoi(argv[i]); + params->custom_n_rank_attention_norm = true; + } else if (arg == "--rank-ffn-norm") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_ffn_norm = std::stoi(argv[i]); + params->custom_n_rank_ffn_norm = true; + } else if (arg == "--rank-out-norm") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_norm = std::stoi(argv[i]); + params->custom_n_rank_norm = true; + } else if (arg == "--rank-tok-embd") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_tok_embeddings = std::stoi(argv[i]); + params->custom_n_rank_tok_embeddings = true; + } else if 
(arg == "--rank-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_output = std::stoi(argv[i]); + params->custom_n_rank_output = true; + } else if (arg == "--rank-wq") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wq = std::stoi(argv[i]); + params->custom_n_rank_wq = true; + } else if (arg == "--rank-wk") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wk = std::stoi(argv[i]); + params->custom_n_rank_wk = true; + } else if (arg == "--rank-wv") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wv = std::stoi(argv[i]); + params->custom_n_rank_wv = true; + } else if (arg == "--rank-wo") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wo = std::stoi(argv[i]); + params->custom_n_rank_wo = true; + } else if (arg == "--rank-ffn_gate") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_ffn_gate = std::stoi(argv[i]); + params->custom_n_rank_ffn_gate = true; + } else if (arg == "--rank-ffn_down") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_ffn_down = std::stoi(argv[i]); + params->custom_n_rank_ffn_down = true; + } else if (arg == "--rank-ffn_up") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_ffn_up = std::stoi(argv[i]); + params->custom_n_rank_ffn_up = true; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + finish_processing_train_args(¶ms->common); + return true; +} + +struct save_train_files_data { + const char * fn_checkpoint_out; + const char * fn_lora_out; + const char * pattern_fn_it; + const char * fn_latest; + struct my_llama_model * model; + struct my_llama_lora * lora; +}; + +static void save_train_files(void * vdata, struct train_state * train) { + struct save_train_files_data * data = (struct save_train_files_data *) vdata; + + int64_t iter = train->opt->iter; + + if (strlen(data->fn_checkpoint_out) > 0) { + save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->model, data->lora, train); + save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->model, data->lora, train); + } + if (strlen(data->fn_lora_out) > 0) { + save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora); + save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->lora); + } +} + +static int64_t get_parameter_count(struct my_llama_lora* lora) { + int64_t nx = 0; + nx += ggml_nelements(lora->tok_embeddings_a); + nx += ggml_nelements(lora->tok_embeddings_b); + nx += ggml_nelements(lora->norm_a); + nx += ggml_nelements(lora->norm_b); + nx += ggml_nelements(lora->output_a); + nx += ggml_nelements(lora->output_b); + + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + nx += ggml_nelements(layer.attention_norm_a); + nx += ggml_nelements(layer.attention_norm_b); + nx += ggml_nelements(layer.wq_a); + nx += ggml_nelements(layer.wq_b); + nx += ggml_nelements(layer.wk_a); + nx += ggml_nelements(layer.wk_b); + nx += 
ggml_nelements(layer.wv_a); + nx += ggml_nelements(layer.wv_b); + nx += ggml_nelements(layer.wo_a); + nx += ggml_nelements(layer.wo_b); + nx += ggml_nelements(layer.ffn_norm_a); + nx += ggml_nelements(layer.ffn_norm_b); + nx += ggml_nelements(layer.ffn_gate_a); + nx += ggml_nelements(layer.ffn_gate_b); + nx += ggml_nelements(layer.ffn_down_a); + nx += ggml_nelements(layer.ffn_down_b); + nx += ggml_nelements(layer.ffn_up_a); + nx += ggml_nelements(layer.ffn_up_b); + } + return nx; +} + +int main(int argc, char ** argv) { + struct train_params params = get_default_train_params(); + + if (!train_params_parse(argc, argv, ¶ms)) { + return 1; + } + + if (params.common.seed == LLAMA_DEFAULT_SEED) { + params.common.seed = time(NULL); + } + printf("%s: seed: %u\n", __func__, params.common.seed); + srand(params.common.seed); + + struct llama_model_params llama_mparams = llama_model_default_params(); + llama_mparams.n_gpu_layers = params.common.n_gpu_layers; + llama_mparams.vocab_only = false; + + printf("%s: model base = '%s'\n", __func__, params.fn_model_base); + struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_mparams); + + struct llama_context_params llama_cparams = llama_context_default_params(); + struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_cparams); + + struct my_llama_model model; + init_model(lmodel, &model, params.fn_model_base, params.common.n_ctx); + + struct my_llama_lora lora; + + struct train_state * train = init_train_state(); + struct ggml_opt_context * opt = train->opt; + + // set params from command line + if (params.custom_f_norm_rms_eps) { + model.hparams.f_norm_rms_eps = params.f_norm_rms_eps; + } + if (params.custom_rope_freq_base) { + model.hparams.rope_freq_base = params.rope_freq_base; + } + if (params.custom_rope_freq_scale) { + model.hparams.rope_freq_scale = params.rope_freq_scale; + } + lora.hparams.lora_r = params.lora_r; + lora.hparams.lora_alpha = params.custom_lora_alpha ? params.lora_alpha : params.lora_r; + uint32_t n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1; + uint32_t n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; + uint32_t n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r; + uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; + uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; + uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; + uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r; + uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r; + uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r; + uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; + uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; + uint32_t n_rank_output = params.custom_n_rank_output ? 
params.n_rank_output : params.lora_r; + lora.hparams.n_rank_attention_norm = n_rank_attention_norm; + lora.hparams.n_rank_wq = n_rank_wq; + lora.hparams.n_rank_wk = n_rank_wk; + lora.hparams.n_rank_wv = n_rank_wv; + lora.hparams.n_rank_wo = n_rank_wo; + lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; + lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate; + lora.hparams.n_rank_ffn_down = n_rank_ffn_down; + lora.hparams.n_rank_ffn_up = n_rank_ffn_up; + lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; + lora.hparams.n_rank_norm = n_rank_norm; + lora.hparams.n_rank_output = n_rank_output; + + // set opt params from command line + opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); + opt->params.print_forward_graph = false; + opt->params.print_backward_graph = false; + opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; + opt->params.n_threads = params.common.n_threads; + opt->params.past = params.common.opt_past; + opt->params.delta = params.common.opt_delta; + opt->params.max_no_improvement = params.common.opt_max_no_improvement; + opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation; + opt->params.adam.n_iter = params.common.adam_n_iter; + opt->params.adam.sched = 1.0f; + opt->params.adam.alpha = params.common.adam_alpha; + opt->params.adam.decay = params.common.adam_decay; + opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim; + opt->params.adam.beta1 = params.common.adam_beta1; + opt->params.adam.beta2 = params.common.adam_beta2; + opt->params.adam.gclip = params.common.adam_gclip; + opt->params.adam.eps_f = params.common.adam_eps_f; + + printf("%s: init model\n", __func__); + bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); + + if (existed) { + // overwrite last n_ctx with user provided n_ctx + if (params.common.custom_n_ctx) { + model.hparams.n_ctx = params.common.n_ctx; + } + + const bool opt_param_count_changed = ( + (lora.hparams.n_rank_attention_norm != n_rank_attention_norm) + || (lora.hparams.n_rank_wq != n_rank_wq) + || (lora.hparams.n_rank_wk != n_rank_wk) + || (lora.hparams.n_rank_wv != n_rank_wv) + || (lora.hparams.n_rank_wo != n_rank_wo) + || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) + || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate) + || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down) + || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up) + || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) + || (lora.hparams.n_rank_norm != n_rank_norm) + || (lora.hparams.n_rank_output != n_rank_output) + ); + + const bool opt_past_changed = opt->params.past != params.common.opt_past; + + if (opt_param_count_changed) { + print_lora_params(&lora.hparams); + die("Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting."); + // need to discard previous optimizer gradient statistics and opt_init with new shapes + // TODO + } + if (opt_past_changed) { + die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. 
Aborting"); + // need to discard previous optimizer past function value statistics and opt_init with new shapes + // TODO + } + } else { // existed == false + init_lora(&model, &lora); + randomize_lora(&lora, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); + if (!params.only_write_lora) { + ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora)); + } + } + opt->iter = train->train_its; + + print_params(&model.hparams); + print_lora_params(&lora.hparams); + printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its); + printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); + printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); + printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); + printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f)); + + if (params.only_write_lora) { + save_train_files_data save_data; + save_data.fn_checkpoint_out = ""; + save_data.fn_lora_out = params.fn_lora_out; + save_data.pattern_fn_it = params.common.pattern_fn_it; + save_data.fn_latest = params.common.fn_latest; + save_data.model = &model; + save_data.lora = &lora; + + save_train_files(&save_data, train); + + free_train_state(train); + ggml_free(lora.ctx); + llama_free(lctx); + llama_free_model(lmodel); + return 0; + } + + printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); + printf("%s: opt iter %d\n", __func__, opt->iter); + + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; + int n_batch = params.common.n_batch; + + // context for input tensors without their data + struct ggml_init_params ctx_input_params = { + ggml_tensor_overhead() * 2, // mem_size + NULL, // mem_buffer + true, // no_alloc + }; + struct ggml_context * ctx_input = ggml_init(ctx_input_params); + + // the input tensors + struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); + struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + + // allocate input tensors + // measure required memory for input tensors + ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type()); + size_t max_input_size = ggml_backend_buffer_get_size(input_data); + printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); + + // context for compute tensors without their data + const size_t estimated_compute_size_wo_data = ( + 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + + (params.common.use_checkpointing ? 
3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) + ); + struct ggml_init_params ctx_compute_params = { + estimated_compute_size_wo_data, // mem_size + NULL, // mem_buffer + true, // no_alloc + }; + struct ggml_context * ctx_compute = NULL; + + struct ggml_tensor * loss = NULL; + struct ggml_tensor * logits = NULL; + + struct ggml_cgraph * gf = NULL; + struct ggml_cgraph * gb = NULL; + struct ggml_cgraph * gb_tmp = NULL; + + // measure required memory for compute tensors + size_t best_compute_size = SIZE_MAX; + enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT; + // find best evaluation order + for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { + ctx_compute = ggml_init(ctx_compute_params); + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); + gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); + gf->order = (enum ggml_cgraph_eval_order) order; + gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); + gb_tmp = params.common.use_checkpointing + ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) + : NULL; + loss = llama_build_lora_finetune_graphs( + &model, &lora, alloc, ctx_compute, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.common.use_flash, + params.common.use_checkpointing, + true + ); + size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer + if (max_compute_size < best_compute_size) { + best_compute_size = max_compute_size; + best_order = gf->order; + } + ggml_gallocr_free(alloc); + ggml_free(ctx_compute); + } + size_t max_compute_size = best_compute_size; + printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); + printf("%s: evaluation order = %s\n", __func__, + (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" : + (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" : + "invalid"); + + // allocate compute tensors + ctx_compute = ggml_init(ctx_compute_params); + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); + gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); + gf->order = best_order; + gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); + gb_tmp = params.common.use_checkpointing + ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) + : NULL; + loss = llama_build_lora_finetune_graphs( + &model, &lora, alloc, ctx_compute, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.common.use_flash, + params.common.use_checkpointing, + false + ); + + // tokenize data + std::vector train_tokens; + std::vector train_samples_begin; + std::vector train_samples_size; + printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); + printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str()); + printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? 
"true" : "false"); + tokenize_file(lctx, + params.common.fn_train_data, + params.common.sample_start, + params.common.include_sample_start, + params.common.overlapping_samples, + n_tokens, + train_tokens, + train_samples_begin, + train_samples_size); + GGML_ASSERT(train_samples_begin.size() == train_samples_size.size()); + + printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size()); + + std::vector token_noccurs; + token_noccurs.resize(model.hparams.n_vocab, 0); + for (unsigned int i = 0; i < train_tokens.size(); ++i) { + ++token_noccurs[train_tokens[i]]; + } + int n_unique_tokens = 0; + for (unsigned int i = 0; i < token_noccurs.size(); ++i) { + if (token_noccurs[i] == 0) continue; + ++n_unique_tokens; + } + printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); + + size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); + const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size()); + if (changed_train_data) { + printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); + } + if (params.common.force_reshuffle) { + printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__); + } + if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) { + train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed); + train->shuffle_sample_count = train_samples_size.size(); + train->shuffle_next_sample = 0; + train->shuffle_samples_hash = shuffle_samples_hash; + } + std::vector train_shuffled_samples_offs; + std::vector train_shuffled_samples_begin; + std::vector train_shuffled_samples_size; + train_shuffled_samples_offs.resize(train_samples_begin.size()); + train_shuffled_samples_begin.resize(train_samples_begin.size()); + train_shuffled_samples_size.resize(train_samples_size.size()); + train->shuffle_rng_state_next = shuffle_samples( + train->shuffle_rng_state_current, + train_shuffled_samples_offs.data(), + train_shuffled_samples_begin.data(), + train_shuffled_samples_size.data(), + train_samples_begin.data(), + train_samples_size.data(), + train_samples_size.size()); + + printf("%s: begin training\n", __func__); + + save_train_files_data save_data; + save_data.fn_checkpoint_out = params.common.fn_checkpoint_out; + save_data.fn_lora_out = params.fn_lora_out; + save_data.pattern_fn_it = params.common.pattern_fn_it; + save_data.fn_latest = params.common.fn_latest; + save_data.model = &model; + save_data.lora = &lora; + + struct train_opt_callback_data opt_cb_data; + opt_cb_data.params = ¶ms.common; + opt_cb_data.train = train; + opt_cb_data.save_cb = &save_train_files; + opt_cb_data.save_data = &save_data; + opt_cb_data.lctx = lctx; + opt_cb_data.last_save_iter = opt->iter; + opt_cb_data.tokens_data = train_tokens.data(); + opt_cb_data.tokens_size = train_tokens.size(); + opt_cb_data.samples_begin = train_samples_begin.data(); + opt_cb_data.samples_size = train_samples_size.data(); + opt_cb_data.shuffled_samples_offs = train_shuffled_samples_offs.data(); + opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); + opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); + opt_cb_data.samples_count = train_samples_size.size(); + opt_cb_data.tokens_input = tokens_input; + opt_cb_data.target_probs = target_probs; + 
opt_cb_data.first_iter = opt->iter; + opt_cb_data.first_epoch = train->train_epochs; + opt_cb_data.iter_at_last_epoch = -1; + opt_cb_data.last_time = ggml_time_ms(); + opt_cb_data.millis_per_iter = 0.0; + + // measure required memory for work buffer + size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; + printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); + + // context for work buffer + struct ggml_init_params ctx_work_params = { + max_work_size, // mem_size + NULL, // mem_buffer + false, // no_alloc + }; + struct ggml_context * ctx_work = ggml_init(ctx_work_params); + + int64_t t0 = ggml_time_ms(); + + ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); + + ggml_free(ctx_work); + ggml_free(ctx_compute); + ggml_free(ctx_input); + ggml_gallocr_free(alloc); + + + int64_t t1 = ggml_time_ms(); + printf("%s: total training time: ", __func__); + print_duration((double) (t1 - t0)); + printf("\n"); + + int new_iters = opt->iter - opt_cb_data.last_save_iter; + if (new_iters > 0) { + train->train_its += new_iters; + train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; + + save_train_files(&save_data, train); + opt_cb_data.last_save_iter = opt->iter; + } + + ggml_free(opt->ctx); + free_train_state(train); + ggml_free(lora.ctx); + llama_free(lctx); + llama_free_model(lmodel); + return 0; +} diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh new file mode 100644 index 000000000..e3cc7f271 --- /dev/null +++ b/examples/finetune/finetune.sh @@ -0,0 +1,34 @@ +#!/bin/bash +cd `dirname $0` +cd ../.. + +EXE="./llama-finetune" + +if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi +if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi + +# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. +MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing. 
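Editorial aside on the bookkeeping in `main()` above: after training, the example advances `train->train_tokens` by `new_iters * n_gradient_accumulation * n_batch * n_tokens`. The worked example below simply evaluates that product with numbers matching the script's `--adam-iter 30 --batch 4 --ctx 64` settings and an assumed gradient-accumulation of 1.

```cpp
// Worked example of the post-training token accounting:
// tokens trained on = new iterations * gradient-accumulation steps * batch size * context length.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t new_iters  = 30;  // opt->iter - last_save_iter (e.g. --adam-iter 30)
    const int64_t grad_accum = 1;   // opt->params.n_gradient_accumulation (assumed default)
    const int64_t n_batch    = 4;   // --batch 4
    const int64_t n_tokens   = 64;  // --ctx 64

    const int64_t trained_tokens = new_iters * grad_accum * n_batch * n_tokens;
    std::printf("tokens trained on in this run: %lld\n", (long long) trained_tokens);  // 7680
    return 0;
}
```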
+ +while getopts "dg" opt; do + case $opt in + d) + DEBUGGER="gdb --args" + ;; + g) + EXE="./build/bin/Release/finetune" + GPUARG="--gpu-layers 25" + ;; + esac +done + +$DEBUGGER $EXE \ + --model-base $MODEL \ + $GPUARG \ + --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ + --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ + --lora-out lora-ol3b-shakespeare-ITERATION.bin \ + --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \ + --save-every 10 \ + --threads 10 --adam-iter 30 --batch 4 --ctx 64 \ + --use-checkpointing diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt index d2cb524c0..4edd6ec73 100644 --- a/examples/gbnf-validator/CMakeLists.txt +++ b/examples/gbnf-validator/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator) add_executable(${TARGET} gbnf-validator.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index a610e6a0b..48a705e15 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -1,5 +1,9 @@ +#define LLAMA_API_INTERNAL + +#include "grammar-parser.h" +#include "ggml.h" +#include "llama.h" #include "unicode.h" -#include "llama-grammar.h" #include #include @@ -8,24 +12,29 @@ #include #include -static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { - const auto cpts = unicode_cpts_from_utf8(input_str); +static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { + auto decoded = decode_utf8(input_str, {}); + const auto & code_points = decoded.first; - auto & stacks_cur = llama_grammar_get_stacks(grammar); + const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); + llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar); size_t pos = 0; - for (const auto & cpt : cpts) { - llama_grammar_accept(grammar, cpt); + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { + const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy - if (stacks_cur.empty()) { + llama_grammar_accept(rules, prev_stacks, *it, cur_stacks); + + if (cur_stacks.empty()) { error_pos = pos; - error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'"; + error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; + cur_stacks = prev_stacks; return false; } ++pos; } - for (const auto & stack : stacks_cur) { + for (const auto & stack : cur_stacks) { if (stack.empty()) { return true; } @@ -76,11 +85,30 @@ int main(int argc, char** argv) { grammar_str = buffer.str(); } - llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0); - if (grammar == nullptr) { - fprintf(stdout, "Failed to initialize llama_grammar\n"); + // Parse the GBNF grammar + auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + + // will be empty (default) if there are parse errors + if (parsed_grammar.rules.empty()) { + fprintf(stdout, "%s: failed to parse grammar\n", __func__); return 1; } + + // Ensure that there is a "root" node. 
+ if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) { + fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__); + return 1; + } + + std::vector grammar_rules(parsed_grammar.c_rules()); + + // Create the LLAMA grammar + auto grammar = llama_grammar_init( + grammar_rules.data(), + grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + if (grammar == nullptr) { + throw std::runtime_error("Failed to initialize llama_grammar"); + } // Read the input file std::string input_str; { @@ -94,7 +122,7 @@ int main(int argc, char** argv) { // Validate the input string against the grammar size_t error_pos; std::string error_msg; - bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg); + bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg); if (is_valid) { fprintf(stdout, "Input string is valid according to the grammar.\n"); @@ -103,7 +131,7 @@ int main(int argc, char** argv) { } // Clean up - llama_grammar_free_impl(grammar); + llama_grammar_free(grammar); return 0; } diff --git a/examples/gen-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt deleted file mode 100644 index 25de0af35..000000000 --- a/examples/gen-docs/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-gen-docs) -add_executable(${TARGET} gen-docs.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp deleted file mode 100644 index 77c59a836..000000000 --- a/examples/gen-docs/gen-docs.cpp +++ /dev/null @@ -1,83 +0,0 @@ -#include "arg.h" -#include "common.h" - -#include -#include - -// Export usage message (-h) to markdown format - -static void write_table_header(std::ofstream & file) { - file << "| Argument | Explanation |\n"; - file << "| -------- | ----------- |\n"; -} - -static void write_table_entry(std::ofstream & file, const common_arg & opt) { - file << "| `"; - // args - for (const auto & arg : opt.args) { - if (arg == opt.args.front()) { - file << arg; - if (opt.args.size() > 1) file << ", "; - } else { - file << arg << (arg != opt.args.back() ? ", " : ""); - } - } - // value hint - if (opt.value_hint) { - std::string md_value_hint(opt.value_hint); - string_replace_all(md_value_hint, "|", "\\|"); - file << " " << md_value_hint; - } - if (opt.value_hint_2) { - std::string md_value_hint_2(opt.value_hint_2); - string_replace_all(md_value_hint_2, "|", "\\|"); - file << " " << md_value_hint_2; - } - // help text - std::string md_help(opt.help); - string_replace_all(md_help, "\n", "
"); - string_replace_all(md_help, "|", "\\|"); - file << "` | " << md_help << " |\n"; -} - -static void write_table(std::ofstream & file, std::vector & opts) { - write_table_header(file); - for (const auto & opt : opts) { - write_table_entry(file, *opt); - } -} - -static void export_md(std::string fname, llama_example ex) { - std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); - - common_params params; - auto ctx_arg = common_params_parser_init(params, ex); - - std::vector common_options; - std::vector sparam_options; - std::vector specific_options; - for (auto & opt : ctx_arg.options) { - // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example - if (opt.is_sparam) { - sparam_options.push_back(&opt); - } else if (opt.in_example(ctx_arg.ex)) { - specific_options.push_back(&opt); - } else { - common_options.push_back(&opt); - } - } - - file << "**Common params**\n\n"; - write_table(file, common_options); - file << "\n\n**Sampling params**\n\n"; - write_table(file, sparam_options); - file << "\n\n**Example-specific params**\n\n"; - write_table(file, specific_options); -} - -int main(int, char **) { - export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN); - export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); - - return 0; -} diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt index 15c5c68c6..633f45535 100644 --- a/examples/gguf-hash/CMakeLists.txt +++ b/examples/gguf-hash/CMakeLists.txt @@ -4,19 +4,12 @@ install(TARGETS ${TARGET} RUNTIME) # clibs dependencies include_directories(deps/) - add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h) target_link_libraries(${TARGET} PRIVATE xxhash) - add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h) target_link_libraries(${TARGET} PRIVATE sha1) -if (NOT MSVC) - # disable warnings in 3rd party code - target_compile_options(sha1 PRIVATE -w) -endif() - add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h) target_link_libraries(${TARGET} PRIVATE sha256) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp index 9523ec122..e96c75117 100644 --- a/examples/gguf-hash/gguf-hash.cpp +++ b/examples/gguf-hash/gguf-hash.cpp @@ -1,5 +1,4 @@ #include "ggml.h" -#include "gguf.h" #include /* abort() */ #include diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt index c407e2f0a..f63887da7 100644 --- a/examples/gguf-split/CMakeLists.txt +++ b/examples/gguf-split/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index ef3ceb686..881f0451c 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -1,19 +1,18 @@ -#include "ggml.h" -#include "gguf.h" #include "llama.h" #include "common.h" #include -#include -#include -#include +#include #include -#include -#include #include #include #include +#include +#include +#include +#include + #if defined(_WIN32) #include #ifndef PATH_MAX @@ -23,20 +22,12 @@ #endif enum 
split_operation : uint8_t { - OP_NONE, - OP_SPLIT, - OP_MERGE, -}; - -enum split_mode : uint8_t { - MODE_NONE, - MODE_TENSOR, - MODE_SIZE, + SPLIT_OP_SPLIT, + SPLIT_OP_MERGE, }; struct split_params { - split_operation operation = OP_NONE; - split_mode mode = MODE_NONE; + split_operation operation = SPLIT_OP_SPLIT; size_t n_bytes_split = 0; int n_split_tensors = 128; std::string input; @@ -96,52 +87,59 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p } bool arg_found = false; + bool is_op_set = false; + bool is_mode_set = false; if (arg == "-h" || arg == "--help") { split_print_usage(argv[0]); exit(0); - } else if (arg == "--version") { + } + if (arg == "--version") { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); - } else if (arg == "--dry-run") { + } + if (arg == "--dry-run") { arg_found = true; params.dry_run = true; - } else if (arg == "--no-tensor-first-split") { + } + if (arg == "--no-tensor-first-split") { arg_found = true; params.no_tensor_first_split = true; - } else if (arg == "--merge") { + } + + if (is_op_set) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } + if (arg == "--merge") { arg_found = true; - if (params.operation != OP_NONE && params.operation != OP_MERGE) { - throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); - } - params.operation = OP_MERGE; - } else if (arg == "--split") { + is_op_set = true; + params.operation = SPLIT_OP_MERGE; + } + if (arg == "--split") { arg_found = true; - if (params.operation != OP_NONE && params.operation != OP_SPLIT) { - throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); - } - params.operation = OP_SPLIT; - } else if (arg == "--split-max-tensors") { + is_op_set = true; + params.operation = SPLIT_OP_SPLIT; + } + + if (is_mode_set) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } + if (arg == "--split-max-tensors") { if (++arg_idx >= argc) { invalid_param = true; break; } arg_found = true; - if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) { - throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); - } - params.mode = MODE_TENSOR; + is_mode_set = true; params.n_split_tensors = atoi(argv[arg_idx]); - } else if (arg == "--split-max-size") { + } + if (arg == "--split-max-size") { if (++arg_idx >= argc) { invalid_param = true; break; } arg_found = true; - if (params.mode != MODE_NONE && params.mode != MODE_SIZE) { - throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); - } - params.mode = MODE_SIZE; + is_mode_set = true; params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]); } @@ -150,20 +148,11 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p } } - // the operation is split if not specified - if (params.operation == OP_NONE) { - params.operation = OP_SPLIT; - } - // the split mode is by tensor if not specified - if (params.mode == MODE_NONE) { - params.mode = MODE_TENSOR; - } - if (invalid_param) { throw std::invalid_argument("error: invalid parameter for argument: " + arg); } - if (argc - arg_idx != 2) { + if (argc - arg_idx < 2) { throw std::invalid_argument("error: bad arguments"); } @@ -276,19 +265,17 @@ 
struct split_strategy { } bool should_split(int i_tensor, size_t next_size) { - if (params.mode == MODE_SIZE) { + if (params.n_bytes_split > 0) { // split by max size per file return next_size > params.n_bytes_split; - } else if (params.mode == MODE_TENSOR) { + } else { // split by number of tensors per file return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; } - // should never happen - GGML_ABORT("invalid mode"); } void print_info() { - printf("n_split: %zu\n", ctx_outs.size()); + printf("n_split: %ld\n", ctx_outs.size()); int i_split = 0; for (auto & ctx_out : ctx_outs) { // re-calculate the real gguf size for each split (= metadata size + total size of all tensors) @@ -298,7 +285,7 @@ struct split_strategy { total_size += ggml_nbytes(t); } total_size = total_size / 1000 / 1000; // convert to megabytes - printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); i_split++; } } @@ -402,17 +389,10 @@ static void gguf_merge(const split_params & split_params) { int n_split = 1; int total_tensors = 0; - // avoid overwriting existing output file - if (std::ifstream(split_params.output.c_str())) { - fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str()); - exit(EXIT_FAILURE); - } - + auto * ctx_out = gguf_init_empty(); std::ofstream fout(split_params.output.c_str(), std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors - auto * ctx_out = gguf_init_empty(); - std::vector read_data; std::vector ctx_metas; std::vector ctx_ggufs; @@ -572,9 +552,9 @@ int main(int argc, const char ** argv) { split_params_parse(argc, argv, params); switch (params.operation) { - case OP_SPLIT: gguf_split(params); + case SPLIT_OP_SPLIT: gguf_split(params); break; - case OP_MERGE: gguf_merge(params); + case SPLIT_OP_MERGE: gguf_merge(params); break; default: split_print_usage(argv[0]); exit(EXIT_FAILURE); diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh index 05a932227..d5a92d605 100755 --- a/examples/gguf-split/tests.sh +++ b/examples/gguf-split/tests.sh @@ -41,7 +41,7 @@ echo PASS echo # 2b. Test the sharded model is loading properly -$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32 echo PASS echo @@ -51,7 +51,7 @@ echo PASS echo # 3b. Test the merged model is loading properly -$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32 echo PASS echo @@ -61,7 +61,7 @@ echo PASS echo # 4b. Test the sharded model is loading properly -$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32 echo PASS echo @@ -71,7 +71,7 @@ echo #echo # 5b. Test the merged model is loading properly -#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 +#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 #echo PASS #echo @@ -81,7 +81,7 @@ echo PASS echo # 6b. 
Test the sharded model is loading properly -$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32 echo PASS echo diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index fb04eb83f..a9569b411 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index f31989c8c..7498f85ef 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -1,9 +1,10 @@ #include "ggml.h" -#include "gguf.h" #include +#include #include #include +#include #include #undef MIN @@ -134,10 +135,9 @@ static bool gguf_ex_read_0(const std::string & fname) { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name (ctx, i); - const size_t size = gguf_get_tensor_size (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); - printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset); + printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); } } @@ -182,10 +182,9 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name (ctx, i); - const size_t size = gguf_get_tensor_size (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); - printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset); + printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); } } @@ -200,8 +199,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n", - __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data); + printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data); // print first 10 elements const float * data = (const float *) cur->data; @@ -217,7 +215,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { const float * data = (const float *) cur->data; for (int j = 0; j < ggml_nelements(cur); ++j) { if (data[j] != 100 + i) { - fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i)); + fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]); gguf_free(ctx); return false; } @@ -247,8 +245,6 @@ int main(int argc, char ** argv) { check_data = false; } - srand(123456); - const std::string fname(argv[1]); const std::string mode (argv[2]); diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt index fa1b4dc70..86dfddca3 100644 --- a/examples/gritlm/CMakeLists.txt +++ b/examples/gritlm/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gritlm) add_executable(${TARGET} gritlm.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) 
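Editorial aside: several hunks above (in `gguf-split.cpp` and `gguf.cpp`) swap between `%zu`, `%ld` and `PRIi64` when printing sizes and tensor counts. For reference, the portable combinations are `%zu` for `size_t` and the `PRIi64` macro for `int64_t`; `%ld` only happens to work where `long` is 64 bits. The snippet below is a neutral illustration with made-up values, not code from the patch.

```cpp
// Portable printf formats for the types used in the gguf examples:
// size_t -> %zu, int64_t -> PRIi64 (from <cinttypes>).
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> ctx_outs(5);        // stand-in for the per-split gguf contexts
    const int64_t n_tensors    = 291;    // stand-in for gguf_get_n_tensors()
    const size_t  total_size_m = 3825;   // stand-in for a split size in MB

    std::printf("n_split: %zu\n", ctx_outs.size());
    std::printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", 1, n_tensors, total_size_m);
    return 0;
}
```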
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 72eb46257..2c61c2e1e 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -1,4 +1,3 @@ -#include "arg.h" #include "common.h" #include "llama.h" @@ -10,26 +9,25 @@ static std::vector> encode(llama_context * ctx, const std::vector & sentences, const std::string & instruction) { std::vector> result; - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); + const llama_model * mdl = llama_get_model(ctx); llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { - common_batch_clear(batch); + llama_batch_clear(batch); const std::string input_string = instruction + sentences[i]; - std::vector inputs = common_tokenize(vocab, input_string, true, false); + std::vector inputs = llama_tokenize(mdl, input_string, true, false); const int32_t n_toks = inputs.size(); // GritLM seems to have EOS = "" // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18 - // inputs.push_back(llama_vocab_eos(vocab)); + // inputs.push_back(llama_token_eos(mdl)); // we want to ignore instruction tokens for mean pooling - const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size(); + const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size(); #ifdef GRIT_DEBUG // debug tokens - should be matching as referenced in the GritLM sample @@ -41,7 +39,7 @@ static std::vector> encode(llama_context * ctx, const std::ve // add input to batch (this increments n_tokens) for (int32_t j = 0; j < n_toks; j++) { - common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); + llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); } // clear previous kv_cache values (irrelevant for embeddings) @@ -53,7 +51,7 @@ static std::vector> encode(llama_context * ctx, const std::ve llama_decode(ctx, batch); // get embedding dimensions - uint64_t n_embd = llama_model_n_embd(model); + uint64_t n_embd = llama_n_embd(mdl); // allocate embedding output std::vector emb_unorm(n_embd, 0.0f); @@ -76,7 +74,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } std::vector emb_norm(emb_unorm.size()); - common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2); + llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); result.push_back(emb_norm); #ifdef GRIT_DEBUG @@ -94,13 +92,11 @@ static std::vector> encode(llama_context * ctx, const std::ve return result; } -static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) { +static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) { std::string result; - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - llama_token eos_token = llama_vocab_eos(vocab); + const llama_model * mdl = llama_get_model(ctx); + llama_token eos_token = llama_token_eos(mdl); llama_kv_cache_clear(ctx); llama_set_embeddings(ctx, false); @@ -108,29 +104,33 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); - std::vector inputs = common_tokenize(vocab, prompt, false, true); + std::vector inputs = llama_tokenize(mdl, prompt, false, true); int32_t i_current_token = 0; while (true) { - common_batch_clear(bat); - { - const int32_t n_inputs = inputs.size(); - - for (int32_t i = 0; i < 
n_inputs; i++) { - common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); - } + llama_batch_clear(bat); + auto n_inputs = (int32_t)inputs.size(); + for (int32_t i = 0; i < n_inputs; i++) { + llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); } inputs.clear(); llama_decode(ctx, bat); + auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1); - llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1); + auto candidates = std::vector(llama_n_vocab(mdl)); + auto n_candidates = (int32_t)candidates.size(); + for (int32_t token = 0; token < n_candidates; token++) { + candidates[token] = llama_token_data{ token, logits[token], 0.0f }; + } + auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false }; + llama_token token = llama_sample_token_greedy(ctx, &candidates_p); if (token == eos_token) { break; } - std::string piece = common_token_to_piece(ctx, token); + std::string piece = llama_token_to_piece(ctx, token); if (stream) { std::printf("%s", piece.c_str()); std::fflush(stdout); @@ -155,31 +155,22 @@ static std::string gritlm_instruction(const std::string & instruction) { } int main(int argc, char * argv[]) { - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); - - llama_model_params mparams = common_model_params_to_llama(params); - llama_context_params cparams = common_context_params_to_llama(params); + llama_model_params mparams = llama_model_params_from_gpt_params(params); + llama_context_params cparams = llama_context_params_from_gpt_params(params); llama_backend_init(); - llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams); + llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams); // create generation context - llama_context * ctx = llama_init_from_model(model, cparams); - - auto sparams = llama_sampler_chain_default_params(); - - sparams.no_perf = false; - - llama_sampler * smpl = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); + llama_context * ctx = llama_new_context_with_model(mdl, cparams); // ### Embedding/Representation ### // samples taken from: https://github.com/ContextualAI/gritlm#basic @@ -200,12 +191,12 @@ int main(int argc, char * argv[]) { const std::vector> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); - const int n_embd = llama_model_n_embd(model); + const int n_embd = llama_n_embd(mdl); - const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); - const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); - const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); - const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); + const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); + const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); 
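Editorial aside: the gritlm hunks above end by comparing query and document embeddings with `llama_embd_similarity_cos` / `common_embd_similarity_cos`. As a reference point, the sketch below is a plain cosine-similarity implementation over two float vectors with made-up 4-dimensional data; it is not the library's code, only the operation those helpers compute.

```cpp
// Plain cosine similarity between two embedding vectors:
// dot(a, b) / (|a| * |b|), accumulated in double for stability.
#include <cmath>
#include <cstdio>

static float cosine_sim(const float * a, const float * b, int n) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (int i = 0; i < n; ++i) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    if (na == 0.0 || nb == 0.0) {
        return 0.0f;  // avoid dividing by zero for zero-length vectors
    }
    return (float) (dot / (std::sqrt(na) * std::sqrt(nb)));
}

int main() {
    const float q[4] = { 0.1f, 0.3f, -0.2f, 0.9f };  // made-up "query" embedding
    const float d[4] = { 0.2f, 0.1f, -0.1f, 0.8f };  // made-up "document" embedding
    std::printf("cosine similarity: %.3f\n", cosine_sim(q, d, 4));
    return 0;
}
```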
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1); @@ -217,12 +208,11 @@ int main(int argc, char * argv[]) { // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction { const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n"; - std::string response = generate(ctx, smpl, prompt, true); + std::string response = generate(ctx, prompt, true); } - llama_sampler_free(smpl); llama_free(ctx); - llama_model_free(model); + llama_free_model(mdl); llama_backend_free(); return 0; diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt index 412696c47..d4c8265bd 100644 --- a/examples/imatrix/CMakeLists.txt +++ b/examples/imatrix/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 9c056986b..29602881a 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -1,6 +1,6 @@ # llama.cpp/examples/imatrix -Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models. +Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models. 
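
Conceptually, the importance matrix is built by accumulating the mean squared activation seen on the input side of each weight matrix, which is what the `collect_imatrix` hunks further down do (`e.values[j] += x[j]*x[j]; e.counts[j]++;`). A simplified standalone sketch of that accumulation, with names chosen for illustration only (the real collector keys entries by tensor name and also handles the MoE expert layout):

```cpp
// Simplified sketch of the per-tensor activation statistics gathered for the imatrix.
// For every row of input activations, sum the squared value per column and count rows.
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

struct ActivationStats {
    std::vector<float> values;  // running sum of x[j]*x[j] per input column
    std::vector<int>   counts;  // number of rows accumulated per column
};

static void accumulate(std::unordered_map<std::string, ActivationStats> & stats,
                       const std::string & tensor_name,
                       const float * x, int n_rows, int n_cols) {
    auto & e = stats[tensor_name];
    if (e.values.empty()) {
        e.values.resize(n_cols, 0.0f);
        e.counts.resize(n_cols, 0);
    }
    for (int row = 0; row < n_rows; ++row) {
        const float * xr = x + (size_t) row * n_cols;
        for (int j = 0; j < n_cols; ++j) {
            e.values[j] += xr[j] * xr[j];
            e.counts[j]++;
        }
    }
}
```

At save time the collector divides each sum by its count, so the stored matrix is the average squared activation per weight column.
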
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 ## Usage @@ -25,6 +25,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument ## Example ```bash +GGML_CUDA=1 make -j + # generate importance matrix (imatrix.dat) ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index b5f3feb9f..574f5ed9c 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1,12 +1,11 @@ -#include "arg.h" #include "common.h" -#include "log.h" #include "llama.h" #include #include #include #include +#include #include #include #include @@ -18,13 +17,15 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static void print_usage(int, char ** argv) { - LOG("\nexample usage:\n"); - LOG("\n %s \\\n" - " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n" +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s \\\n" + " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); - LOG("\n"); + LOG_TEE("\n"); } struct Stats { @@ -36,13 +37,13 @@ struct Stats { class IMatrixCollector { public: IMatrixCollector() = default; - void set_params(common_params params) { m_params = std::move(params); } + void set_params(gpt_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix(int ncall = -1) const; - bool load_imatrix(const char * fname); + bool load_imatrix(const char * file_name); private: std::unordered_map m_stats; - common_params m_params; + gpt_params m_params; std::mutex m_mutex; int m_last_call = 0; std::vector m_src1_data; @@ -125,10 +126,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * e.counts.resize(src1->ne[0]*n_as, 0); } else if (e.values.size() != (size_t)src1->ne[0]*n_as) { - LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); - exit(1); //GGML_ABORT("fatal error"); + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); + exit(1); //GGML_ASSERT(false); + } + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); } - LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); // loop over all possible experts, regardless if they are used or not in the batch for (int ex = 0; ex < n_as; ++ex) { size_t e_start = ex*src1->ne[0]; @@ -149,8 +152,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * e.values[e_start + j] += x[j]*x[j]; e.counts[e_start + j]++; if (!std::isfinite(e.values[e_start + j])) { - LOG("\n"); - LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str()); + fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str()); exit(1); } } @@ -173,18 +175,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * 
e.counts.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ABORT("fatal error"); + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); } ++e.ncall; - LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } for (int row = 0; row < (int)src1->ne[1]; ++row) { const float * x = data + row * src1->ne[0]; for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[j] += x[j]*x[j]; e.counts[j]++; if (!std::isfinite(e.values[j])) { - LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str()); + fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str()); exit(1); } } @@ -236,17 +240,17 @@ void IMatrixCollector::save_imatrix(int ncall) const { } if (n_zeros != 0 && is_first) { - LOG_INF("\n"); + fprintf(stderr, "\n"); is_first = false; } if (n_zeros == n_all) { - LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); + fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); continue; } if (n_zeros > 0) { - LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); continue; } @@ -255,7 +259,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { } if (to_store.size() < m_stats.size()) { - LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); + fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); } std::ofstream out(fname, std::ios::binary); @@ -287,20 +291,21 @@ void IMatrixCollector::save_imatrix(int ncall) const { out.write(m_params.prompt_file.c_str(), len); } - LOGV(1, "\n"); - LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); + if (m_params.verbosity > 0) { + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); + } } bool IMatrixCollector::load_imatrix(const char * fname) { std::ifstream in(fname, std::ios::binary); if (!in) { - LOG_ERR("%s: failed to open %s\n",__func__, fname); + printf("%s: failed to open %s\n",__func__, fname); return false; } int n_entries; in.read((char*)&n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { - LOG_ERR("%s: no data in file %s\n", __func__, fname); + printf("%s: no data in file %s\n", __func__, fname); return false; } for (int i = 0; i < n_entries; ++i) { @@ -308,7 +313,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); + printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); return false; } name_as_vec[len] = 0; @@ -319,7 +324,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { int nval; 
in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { - LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i); + printf("%s: failed reading number of values for entry %d\n",__func__,i); m_stats = {}; return false; } @@ -332,7 +337,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector tmp(nval); in.read((char*)tmp.data(), nval*sizeof(float)); if (in.fail()) { - LOG_ERR("%s: failed reading data for entry %d\n",__func__,i); + printf("%s: failed reading data for entry %d\n",__func__,i); m_stats = {}; return false; } @@ -427,35 +432,32 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const common_params & params) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - const bool add_bos = llama_vocab_get_add_bos(vocab); +static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); const int n_ctx = llama_n_ctx(ctx); - GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); - auto tim1 = std::chrono::high_resolution_clock::now(); - LOG_INF("%s: tokenizing the input ..\n", __func__); + fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - std::vector tokens = common_tokenize(ctx, params.prompt, true); + std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); - LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); if (params.i_chunk > 0) { if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { - LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); + fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); return false; } - LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); + fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); } if (int(tokens.size()) < 2*n_ctx) { - LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx); - LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size()); + fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, + n_ctx); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return false; } @@ -470,14 +472,14 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk = params.n_chunks < 0 ? 
n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_vocab_n_tokens(vocab); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; int count = 0; double nll = 0.0; double nll2 = 0.0; - LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); + fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); std::vector workers(std::thread::hardware_concurrency() - 1); @@ -499,8 +501,6 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { // clear the KV cache llama_kv_cache_clear(ctx); - llama_batch batch = llama_batch_init(n_batch, 0, 1); - for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); @@ -510,17 +510,12 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_vocab_bos(vocab); + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } - common_batch_clear(batch); - for (int i = 0; i < batch_size; i++) { - common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true); - } - - if (llama_decode(ctx, batch)) { - LOG_ERR("%s : failed to eval\n", __func__); - llama_batch_free(batch); + // TODO: use batch.logits to save computations instead of relying on logits_all == true + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); return false; } @@ -533,35 +528,33 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { } } - llama_batch_free(batch); - const auto t_end = std::chrono::high_resolution_clock::now(); if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - LOG("%d hours ", total_seconds / (60*60)); + fprintf(stderr, "%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - LOG("%.2f minutes\n", total_seconds / 60.0); + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } if (params.compute_ppl) { const int first = n_ctx/2; - const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); + const auto all_logits = num_batches > 1 ? 
logits.data() : llama_get_logits(ctx); process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); count += n_ctx - first - 1; - LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); fflush(stdout); logits.clear(); } } - LOG("\n"); + printf("\n"); if (params.compute_ppl) { nll2 /= count; @@ -570,9 +563,9 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { nll2 -= nll * nll; if (nll2 > 0) { nll2 = sqrt(nll2/(count-1)); - LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); } else { - LOG("Unexpected negative standard deviation of log(prob)\n"); + printf("Unexpected negative standard deviation of log(prob)\n"); } } @@ -580,32 +573,31 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { } int main(int argc, char ** argv) { - common_params params; + gpt_params params; params.n_ctx = 512; params.logits_all = true; - params.escape = false; + params.verbosity = 1; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } - common_init(); - params.n_batch = std::min(params.n_batch, params.n_ctx); g_collector.set_params(params); for (const auto & in_file : params.in_files) { - LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); + printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); if (!g_collector.load_imatrix(in_file.c_str())) { - LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str()); + fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str()); return 1; } } if (params.in_files.size() > 1) { - LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); + printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); g_collector.save_imatrix(); } @@ -619,45 +611,37 @@ int main(int argc, char ** argv) { params.warmup = false; // init - common_init_result llama_init = common_init_from_params(params); - - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == nullptr || ctx == nullptr) { - LOG_ERR("%s : failed to init\n", __func__); + fprintf(stderr, "%s : failed to init\n", __func__); return 1; } - const int n_ctx_train = llama_model_n_ctx_train(model); + const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { - LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); } // print system information { - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } - if (params.prompt.empty()) { - if (params.in_files.empty()) { - LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n"); - return 1; - } - LOG_INF("No prompt provided; combining precomputed matrices only.\n"); - } else { - if (!compute_imatrix(ctx, params)) { - return 1; - } + if (!compute_imatrix(ctx, 
params)) { + return 1; } - g_collector.save_imatrix(); - LOG("\n"); - llama_perf_context_print(ctx); + llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); llama_backend_free(); diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index fb26628d8..9b1aa3b63 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-infill) add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/infill/README.md b/examples/infill/README.md index df4d976f2..810a0c5e7 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -14,7 +14,7 @@ In this section, we cover the most commonly used options for running the `infill - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. -- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference. +- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. - `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. 
## Input Prompts diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 489a208b6..dc93d2301 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -1,9 +1,8 @@ -#include "arg.h" #include "common.h" + #include "console.h" -#include "sampling.h" -#include "log.h" #include "llama.h" +#include "grammar-parser.h" #include #include @@ -35,14 +34,57 @@ static llama_context ** g_ctx; static llama_model ** g_model; -static common_sampler ** g_smpl; -static common_params * g_params; +static gpt_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; static bool is_interacting = false; +static void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const std::vector & input_tokens, const std::string & output, + const std::vector & output_tokens +) { + if (params.logdir.empty()) { + return; + } + + const std::string timestamp = string_get_sortable_timestamp(); + + const bool success = fs_create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: infill\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Generation Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + yaml_dump_string_multiline(logfile, "output", output.c_str()); + yaml_dump_vector_int(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void sigint_handler(int signo) { if (signo == SIGINT) { @@ -50,13 +92,9 @@ static void sigint_handler(int signo) { is_interacting = true; } else { console::cleanup(); - LOG("\n"); - common_perf_print(*g_ctx, *g_smpl); - - // make sure all logs are flushed - LOG("Interrupted by user\n"); - common_log_pause(common_log_main()); - + printf("\n"); + llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); } } @@ -64,135 +102,143 @@ static void sigint_handler(int signo) { #endif int main(int argc, char ** argv) { - common_params params; + gpt_params params; + llama_sampling_params & sparams = params.sparams; g_params = ¶ms; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); - - auto & sparams = params.sampling; +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("infill", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); if (params.logits_all) { - LOG_ERR("\n************\n"); - LOG_ERR("%s: please use the 'perplexity' tool for perplexity 
calculations\n", __func__); - LOG_ERR("************\n\n"); + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); return 0; } if (params.embedding) { - LOG_ERR("\n************\n"); - LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - LOG_ERR("************\n\n"); + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__); + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } - if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { - LOG_ERR("\n************\n"); - LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); - LOG_ERR("************\n\n"); + printf("\n************\n"); + printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); + printf("************\n\n"); return 0; } if (params.rope_freq_base != 0.0) { - LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_INF("%s: llama backend init\n", __func__); + LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + LOG_TEE("%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + + LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = nullptr; - llama_context * ctx = nullptr; - common_sampler * smpl = nullptr; + llama_model * model; + llama_context * ctx; g_model = &model; g_ctx = &ctx; - g_smpl = &smpl; // load the model and apply lora adapter, if any - LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - common_init_result llama_init = common_init_from_params(params); - - model = llama_init.model.get(); - ctx = llama_init.context.get(); + LOG("%s: load the model and apply lora adapter, if any\n", __func__); + std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == NULL) { - LOG_ERR("%s: unable to load model\n", __func__); + LOG_TEE("%s: error: unable to load model\n", __func__); return 1; } - const llama_vocab * vocab = llama_model_get_vocab(model); - - const int n_ctx_train = llama_model_n_ctx_train(model); + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); - LOG_DBG("n_ctx: %d\n", n_ctx); + LOG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); + LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, n_ctx); } // print system information { - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_TEE("\n"); + 
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); } - const bool add_bos = llama_vocab_get_add_bos(vocab); - GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + const bool add_bos = llama_should_add_bos_token(model); + GGML_ASSERT(llama_add_eos_token(model) != 1); + LOG("add_bos: %d\n", add_bos); std::vector embd_inp; std::vector embd_end; - std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); + std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); - GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0); - GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0); + GGML_ASSERT(llama_token_prefix(model) >= 0); + GGML_ASSERT(llama_token_suffix(model) >= 0); - inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab)); - inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab)); + inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab)); + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - const llama_token middle_token = llama_vocab_fim_mid(vocab); + const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { embd_inp.push_back(middle_token); } - LOG_DBG("add_bos: %d\n", add_bos); - LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str()); - LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str()); - LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); + LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); + LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_vocab_bos(vocab)); - LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); + embd_inp.push_back(llama_token_bos(model)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } if ((int) embd_inp.size() > n_ctx - 4) { - LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } @@ -201,8 +247,9 @@ int main(int argc, char ** argv) { params.n_keep = (int)embd_inp.size(); } - LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str()); - LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str()); + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); + // enable interactive mode if interactive start is specified if (params.interactive_first) { @@ -210,21 +257,21 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - LOG_INF("\n"); - LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_TEE("\n"); + LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - 
LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > 0) { - LOG_INF("%s: static prompt based on n_keep: '", __func__); + LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_CNT("'\n"); + LOG_TEE("'\n"); } - LOG_INF("\n"); + LOG_TEE("\n"); } if (params.interactive) { @@ -241,30 +288,30 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG_INF("%s: interactive mode on.\n", __func__); + LOG_TEE("%s: interactive mode on.\n", __func__); if (params.input_prefix_bos) { - LOG_INF("Input prefix with BOS\n"); + LOG_TEE("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); } if (!params.input_suffix.empty()) { - LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - smpl = common_sampler_init(model, sparams); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("\n\n"); - LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); - LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); - LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); - - LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - - LOG_INF("\n"); - LOG_INF("\n##### Infill mode #####\n\n"); + LOG_TEE("\n##### Infill mode #####\n\n"); + if (params.infill) { + printf("\n************\n"); + printf("no need to specify '--infill', always running infill\n"); + printf("************\n\n"); + } if (params.interactive) { const char *control_message; if (params.multiline_input) { @@ -275,11 +322,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_INF("== Running in interactive mode. ==\n"); + LOG_TEE("== Running in interactive mode. ==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_INF( " - Press Ctrl+C to interject at any time.\n"); + LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_INF( "%s\n", control_message); + LOG_TEE( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -299,6 +346,8 @@ int main(int argc, char ** argv) { std::vector embd; + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); + while (n_remain != 0 || params.interactive) { // predict if (!embd.empty()) { @@ -312,8 +361,9 @@ int main(int argc, char ** argv) { embd.resize(max_embd_size); console::set_display(console::error); - LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + printf("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); console::set_display(console::reset); + fflush(stdout); } // infinite text generation via context swapping @@ -322,14 +372,14 @@ int main(int argc, char ** argv) { // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() > n_ctx) { if (params.n_predict == -2) { - LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep - 1; const int n_discard = n_left/2; - LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); @@ -337,9 +387,9 @@ int main(int argc, char ** argv) { n_past -= n_discard; - LOG_DBG("after swap: n_past = %d\n", n_past); + LOG("after swap: n_past = %d\n", n_past); - LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); } @@ -351,16 +401,16 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { - LOG_ERR("%s : failed to eval\n", __func__); + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { + LOG_TEE("%s : failed to eval\n", __func__); return 1; } n_past += n_eval; - LOG_DBG("n_past = %d\n", n_past); + LOG("n_past = %d\n", n_past); } } @@ -368,11 +418,11 @@ int main(int argc, char ** argv) { embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = common_sampler_sample(smpl, ctx, -1); + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr); - common_sampler_accept(smpl, id, true); + llama_sampling_accept(ctx_sampling, ctx, id, true); - // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); embd.push_back(id); @@ -382,16 +432,16 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; - LOG_DBG("n_remain: %d\n", n_remain); + LOG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - common_sampler_accept(smpl, embd_inp[n_consumed], false); + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -403,8 +453,8 @@ int main(int argc, char ** argv) { // display text if (input_echo) { for (auto id : embd) { - const std::string token_str = common_token_to_piece(ctx, id); - LOG("%s", token_str.c_str()); + const std::string token_str = llama_token_to_piece(ctx, id); + printf("%s", token_str.c_str()); if (embd.size() > 1) { input_tokens.push_back(id); @@ -413,6 
+463,7 @@ int main(int argc, char ** argv) { output_ss << token_str; } } + fflush(stdout); } // reset color to default if we there is no pending user input if (input_echo && (int) embd_inp.size() == n_consumed) { @@ -422,12 +473,13 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){ + if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token - LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str()); + printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } - LOG("\n"); + fflush(stdout); + printf("\n"); console::set_display(console::user_input); std::string buffer; std::string line; @@ -462,16 +514,16 @@ int main(int argc, char ** argv) { } // tokenize new prefix and suffix - std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); + std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); - inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab)); - inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab)); + inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab)); + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); @@ -483,33 +535,35 @@ int main(int argc, char ** argv) { n_remain = params.n_predict; n_past = 0; n_consumed = 0; + // LOG_TEE("took new input\n"); is_interacting = false; } // deal with end of generation tokens in interactive mode - else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { - LOG_DBG("found EOS token\n"); + else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { + LOG("found EOS token\n"); if (params.interactive) { is_interacting = true; - LOG("\n"); + printf("\n"); console::set_display(console::user_input); + fflush(stdout); } } if (n_past > 0 && is_interacting && !params.interactive) { - LOG_DBG("waiting for user input\n"); + LOG("waiting for user input\n"); if (params.input_prefix_bos) { - LOG_DBG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_vocab_bos(vocab)); + LOG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_token_bos(model)); } std::string buffer; if (!params.input_prefix.empty()) { - LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); buffer += params.input_prefix; - LOG("%s", buffer.c_str()); + printf("%s", buffer.c_str()); } std::string line; @@ -527,30 +581,30 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty()) { - LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); buffer += params.input_suffix; - LOG("%s", params.input_suffix.c_str()); + printf("%s", params.input_suffix.c_str()); 
} - LOG_DBG("buffer: '%s'\n", buffer.c_str()); + LOG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); - const auto line_inp = common_tokenize(ctx, buffer, false); - LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); + const auto line_inp = ::llama_tokenize(ctx, buffer, false); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); - output_ss << common_token_to_piece(ctx, token); + output_ss << llama_token_to_piece(ctx, token); } n_remain -= line_inp.size(); - LOG_DBG("n_remain: %d\n", n_remain); + LOG("n_remain: %d\n", n_remain); } else { - LOG_DBG("empty line, passing control back\n"); + LOG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -558,14 +612,14 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - common_sampler_reset(smpl); + llama_sampling_reset(ctx_sampling); } is_interacting = false; } } // end of generation - if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) { + if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) { break; } @@ -577,14 +631,22 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str()); + printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + fflush(stdout); } - LOG("\n"); - common_perf_print(ctx, smpl); + llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - common_sampler_free(smpl); + llama_free(ctx); + llama_free_model(model); + + llama_sampling_free(ctx_sampling); llama_backend_free(); +#ifndef LOG_DISABLE_LOGS + LOG_TEE("Log end\n"); +#endif // LOG_DISABLE_LOGS + return 0; } diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index fc9f0097f..a8779bf3b 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -540,7 +540,7 @@ class SchemaConverter: return self._add_rule( name, to_rule(transform()) if self._raw_pattern \ - else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") + else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space") def _resolve_ref(self, ref): diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt index 17e3b9b87..5bdbea4e2 100644 --- a/examples/llama-bench/CMakeLists.txt +++ b/examples/llama-bench/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-bench) add_executable(${TARGET} llama-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index 6bbe4bb75..52b0e74d3 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -14,8 +14,7 @@ Performance testing tool for llama.cpp. 1. [Markdown](#markdown) 2. [CSV](#csv) 3. [JSON](#json) - 4. [JSONL](#jsonl) - 5. [SQL](#sql) + 4. 
[SQL](#sql) ## Syntax @@ -24,34 +23,27 @@ usage: ./llama-bench [options] options: -h, --help - -m, --model (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt (default: 512) - -n, --n-gen (default: 128) - -pg (default: ) - -b, --batch-size (default: 2048) - -ub, --ubatch-size (default: 512) - -ctk, --cache-type-k (default: f16) - -ctv, --cache-type-v (default: f16) - -t, --threads (default: 8) - -C, --cpu-mask (default: 0x0) - --cpu-strict <0|1> (default: 0) - --poll <0...100> (default: 50) - -ngl, --n-gpu-layers (default: 99) - -rpc, --rpc (default: ) - -sm, --split-mode (default: layer) - -mg, --main-gpu (default: 0) - -nkvo, --no-kv-offload <0|1> (default: 0) - -fa, --flash-attn <0|1> (default: 0) - -mmp, --mmap <0|1> (default: 1) - --numa (default: disabled) - -embd, --embeddings <0|1> (default: 0) - -ts, --tensor-split (default: 0) - -r, --repetitions (default: 5) - --prio <0|1|2|3> (default: 0) - --delay <0...N> (seconds) (default: 0) - -o, --output (default: md) - -oe, --output-err (default: none) - -v, --verbose (default: 0) + -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -p, --n-prompt (default: 512) + -n, --n-gen (default: 128) + -pg (default: 512,128) + -b, --batch-size (default: 2048) + -ub, --ubatch-size (default: 512) + -ctk, --cache-type-k (default: f16) + -ctv, --cache-type-v (default: f16) + -t, --threads (default: 16) + -ngl, --n-gpu-layers (default: 99) + -sm, --split-mode (default: layer) + -mg, --main-gpu (default: 0) + -nkvo, --no-kv-offload <0|1> (default: 0) + -fa, --flash-attn <0|1> (default: 0) + -mmp, --mmap <0|1> (default: 1) + --numa (default: disabled) + -embd, --embeddings <0|1> (default: 0) + -ts, --tensor-split (default: 0) + -r, --repetitions (default: 5) + -o, --output (default: md) + -v, --verbose (default: 0) Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. ``` @@ -246,19 +238,6 @@ $ ./llama-bench -o json ] ``` - -### JSONL - -```sh -$ ./llama-bench -o jsonl -``` - -```json lines -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} -``` - - ### SQL SQL output is suitable for importing into a SQLite database. 
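
Whatever the output format, each record's `avg_ns`/`stddev_ns` style fields are aggregated from the `-r` repetitions of a test using the `avg`/`stdev` helpers shown in the llama-bench.cpp hunk further down. A standalone sketch of that aggregation (same sample-standard-deviation formula as the templates in the source):

```cpp
// Mean and sample standard deviation over repeated benchmark samples,
// mirroring the avg/stdev templates in llama-bench.cpp.
#include <cmath>
#include <numeric>
#include <vector>

static double avg(const std::vector<double> & v) {
    if (v.empty()) {
        return 0.0;
    }
    return std::accumulate(v.begin(), v.end(), 0.0) / (double) v.size();
}

static double stdev(const std::vector<double> & v) {
    if (v.size() <= 1) {
        return 0.0;
    }
    const double mean   = avg(v);
    const double sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), 0.0);
    // sample std dev: sqrt((sum x^2 - n*mean^2) / (n - 1))
    return std::sqrt(sq_sum / (double) (v.size() - 1) -
                     mean * mean * (double) v.size() / (double) (v.size() - 1));
}
```
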
The output can be piped into the `sqlite3` command line tool to add the results to a database. diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 4ac19ca86..a6497b6e0 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -6,28 +6,25 @@ #include #include #include -#include #include #include +#include #include #include #include #include #include #include -#include #include -#include "common.h" #include "ggml.h" #include "llama.h" +#include "common.h" +#include "ggml-cuda.h" +#include "ggml-sycl.h" -#ifdef _WIN32 -# define WIN32_LEAN_AND_MEAN -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include +#ifdef GGML_USE_CANN +#include "ggml-cann.h" #endif // utils @@ -36,7 +33,8 @@ static uint64_t get_time_ns() { return std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); } -template static std::string join(const std::vector & values, const std::string & delim) { +template +static std::string join(const std::vector & values, const std::string & delim) { std::ostringstream str; for (size_t i = 0; i < values.size(); i++) { str << values[i]; @@ -47,73 +45,112 @@ template static std::string join(const std::vector & values, const return str.str(); } -template static std::vector transform_to_str(const std::vector & values, F f) { +template +static std::vector transform_to_str(const std::vector & values, F f) { std::vector str_values; std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); return str_values; } -template static T avg(const std::vector & v) { +template +static T avg(const std::vector & v) { if (v.empty()) { return 0; } T sum = std::accumulate(v.begin(), v.end(), T(0)); - return sum / (T) v.size(); + return sum / (T)v.size(); } -template static T stdev(const std::vector & v) { +template +static T stdev(const std::vector & v) { if (v.size() <= 1) { return 0; } - T mean = avg(v); + T mean = avg(v); T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0)); - T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1)); + T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)); return stdev; } static std::string get_cpu_info() { - std::vector cpu_list; - for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - auto * dev = ggml_backend_dev_get(i); - auto dev_type = ggml_backend_dev_type(dev); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) { - cpu_list.push_back(ggml_backend_dev_description(dev)); + std::string id; +#ifdef __linux__ + FILE * f = fopen("/proc/cpuinfo", "r"); + if (f) { + char buf[1024]; + while (fgets(buf, sizeof(buf), f)) { + if (strncmp(buf, "model name", 10) == 0) { + char * p = strchr(buf, ':'); + if (p) { + p++; + while (std::isspace(*p)) { + p++; + } + while (std::isspace(p[strlen(p) - 1])) { + p[strlen(p) - 1] = '\0'; + } + id = p; + break; + } + } } + fclose(f); } - return join(cpu_list, ", "); +#endif + // TODO: other platforms + return id; } static std::string get_gpu_info() { - std::vector gpu_list; - for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - auto * dev = ggml_backend_dev_get(i); - auto dev_type = ggml_backend_dev_type(dev); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) { - gpu_list.push_back(ggml_backend_dev_description(dev)); + std::string id; +#ifdef GGML_USE_CUDA + int count = ggml_backend_cuda_get_device_count(); + for (int i = 0; i < count; i++) { + char buf[128]; + 
ggml_backend_cuda_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { + id += "/"; } } - return join(gpu_list, ", "); +#endif +#ifdef GGML_USE_SYCL + int count = ggml_backend_sycl_get_device_count(); + for (int i = 0; i < count; i++) { + char buf[128]; + ggml_sycl_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { + id += "/"; + } + } +#endif +#ifdef GGML_USE_CANN + uint32_t count = ggml_backend_cann_get_device_count(); + for (uint32_t i = 0; i < count; i++) { + char buf[128]; + ggml_backend_cann_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { + id += "/"; + } + } +#endif + // TODO: other backends + return id; } // command line params -enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL }; +enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL}; static const char * output_format_str(output_formats format) { switch (format) { - case NONE: - return "none"; - case CSV: - return "csv"; - case JSON: - return "json"; - case JSONL: - return "jsonl"; - case MARKDOWN: - return "md"; - case SQL: - return "sql"; - default: - GGML_ABORT("invalid output format"); + case NONE: return "none"; + case CSV: return "csv"; + case JSON: return "json"; + case MARKDOWN: return "md"; + case SQL: return "sql"; + default: GGML_ASSERT(!"invalid output format"); } } @@ -124,8 +161,6 @@ static bool output_format_from_str(const std::string & s, output_formats & forma format = CSV; } else if (s == "json") { format = JSON; - } else if (s == "jsonl") { - format = JSONL; } else if (s == "md") { format = MARKDOWN; } else if (s == "sql") { @@ -138,14 +173,10 @@ static bool output_format_from_str(const std::string & s, output_formats & forma static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_MODE_NONE: - return "none"; - case LLAMA_SPLIT_MODE_LAYER: - return "layer"; - case LLAMA_SPLIT_MODE_ROW: - return "row"; - default: - GGML_ABORT("invalid split mode"); + case LLAMA_SPLIT_MODE_NONE: return "none"; + case LLAMA_SPLIT_MODE_LAYER: return "layer"; + case LLAMA_SPLIT_MODE_ROW: return "row"; + default: GGML_ASSERT(!"invalid split mode"); } } @@ -156,65 +187,53 @@ static std::string pair_str(const std::pair & p) { } struct cmd_params { - std::vector model; - std::vector n_prompt; - std::vector n_gen; + std::vector model; + std::vector n_prompt; + std::vector n_gen; std::vector> n_pg; - std::vector n_batch; - std::vector n_ubatch; - std::vector type_k; - std::vector type_v; - std::vector n_threads; - std::vector cpu_mask; - std::vector cpu_strict; - std::vector poll; - std::vector n_gpu_layers; - std::vector rpc_servers; - std::vector split_mode; - std::vector main_gpu; - std::vector no_kv_offload; - std::vector flash_attn; - std::vector> tensor_split; - std::vector use_mmap; - std::vector embeddings; - ggml_numa_strategy numa; - int reps; - ggml_sched_priority prio; - int delay; - bool verbose; - bool progress; - output_formats output_format; - output_formats output_format_stderr; + std::vector n_batch; + std::vector n_ubatch; + std::vector type_k; + std::vector type_v; + std::vector n_threads; + std::vector n_gpu_layers; + std::vector rpc_servers; + std::vector split_mode; + std::vector main_gpu; + std::vector no_kv_offload; + std::vector flash_attn; + std::vector> tensor_split; + std::vector use_mmap; + std::vector embeddings; + ggml_numa_strategy numa; + int reps; + bool verbose; + output_formats output_format; + output_formats output_format_stderr; }; static const cmd_params 
cmd_params_defaults = { - /* model */ { "models/7B/ggml-model-q4_0.gguf" }, - /* n_prompt */ { 512 }, - /* n_gen */ { 128 }, + /* model */ {"models/7B/ggml-model-q4_0.gguf"}, + /* n_prompt */ {512}, + /* n_gen */ {128}, /* n_pg */ {}, - /* n_batch */ { 2048 }, - /* n_ubatch */ { 512 }, - /* type_k */ { GGML_TYPE_F16 }, - /* type_v */ { GGML_TYPE_F16 }, - /* n_threads */ { cpu_get_num_math() }, - /* cpu_mask */ { "0x0" }, - /* cpu_strict */ { false }, - /* poll */ { 50 }, - /* n_gpu_layers */ { 99 }, - /* rpc_servers */ { "" }, - /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, - /* main_gpu */ { 0 }, - /* no_kv_offload */ { false }, - /* flash_attn */ { false }, - /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, - /* use_mmap */ { true }, - /* embeddings */ { false }, + /* n_batch */ {2048}, + /* n_ubatch */ {512}, + /* type_k */ {GGML_TYPE_F16}, + /* type_v */ {GGML_TYPE_F16}, + /* n_threads */ {cpu_get_num_math()}, + /* n_gpu_layers */ {99}, + /* rpc_servers */ {""}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, + /* main_gpu */ {0}, + /* no_kv_offload */ {false}, + /* flash_attn */ {false}, + /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, + /* use_mmap */ {true}, + /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, - /* prio */ GGML_SCHED_PRIO_NORMAL, - /* delay */ 0, /* verbose */ false, - /* progress */ false, /* output_format */ MARKDOWN, /* output_format_stderr */ NONE, }; @@ -224,69 +243,37 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf("options:\n"); printf(" -h, --help\n"); - printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", - join(cmd_params_defaults.n_prompt, ",").c_str()); - printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -pg (default: %s)\n", - join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", - join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" -ub, --ubatch-size (default: %s)\n", - join(cmd_params_defaults.n_ubatch, ",").c_str()); - printf(" -ctk, --cache-type-k (default: %s)\n", - join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); - printf(" -ctv, --cache-type-v (default: %s)\n", - join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -t, --threads (default: %s)\n", - join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -C, --cpu-mask (default: %s)\n", - join(cmd_params_defaults.cpu_mask, ",").c_str()); - printf(" --cpu-strict <0|1> (default: %s)\n", - join(cmd_params_defaults.cpu_strict, ",").c_str()); - printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", - join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - if (llama_supports_rpc()) { - printf(" -rpc, --rpc (default: %s)\n", - join(cmd_params_defaults.rpc_servers, ",").c_str()); - } - printf(" -sm, --split-mode (default: %s)\n", - join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", - join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", - join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -fa, --flash-attn <0|1> (default: %s)\n", - join(cmd_params_defaults.flash_attn, ",").c_str()); - printf(" -mmp, --mmap <0|1> 
(default: %s)\n", - join(cmd_params_defaults.use_mmap, ",").c_str()); - printf(" --numa (default: disabled)\n"); - printf(" -embd, --embeddings <0|1> (default: %s)\n", - join(cmd_params_defaults.embeddings, ",").c_str()); - printf(" -ts, --tensor-split (default: 0)\n"); - printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); - printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); - printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); - printf(" -o, --output (default: %s)\n", - output_format_str(cmd_params_defaults.output_format)); - printf(" -oe, --output-err (default: %s)\n", - output_format_str(cmd_params_defaults.output_format_stderr)); - printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); - printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0"); + printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); + printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ub, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); + printf(" -ctk, --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); + printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" --numa (default: disabled)\n"); + printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); + printf(" -ts, --tensor-split (default: 0)\n"); + printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); + printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); + printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? 
"1" : "0"); printf("\n"); - printf( - "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter " - "multiple times.\n"); + printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); } static ggml_type ggml_type_from_name(const std::string & s) { if (s == "f16") { return GGML_TYPE_F16; } - if (s == "bf16") { - return GGML_TYPE_BF16; - } if (s == "q8_0") { return GGML_TYPE_Q8_0; } @@ -309,21 +296,19 @@ static ggml_type ggml_type_from_name(const std::string & s) { return GGML_TYPE_COUNT; } -static cmd_params parse_cmd_params(int argc, char ** argv) { - cmd_params params; - std::string arg; - bool invalid_param = false; - const std::string arg_prefix = "--"; - const char split_delim = ','; - params.verbose = cmd_params_defaults.verbose; - params.output_format = cmd_params_defaults.output_format; +static cmd_params parse_cmd_params(int argc, char ** argv) { + cmd_params params; + std::string arg; + bool invalid_param = false; + const std::string arg_prefix = "--"; + const char split_delim = ','; + + params.verbose = cmd_params_defaults.verbose; + params.output_format = cmd_params_defaults.output_format; params.output_format_stderr = cmd_params_defaults.output_format_stderr; - params.reps = cmd_params_defaults.reps; - params.numa = cmd_params_defaults.numa; - params.prio = cmd_params_defaults.prio; - params.delay = cmd_params_defaults.delay; - params.progress = cmd_params_defaults.progress; + params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -365,7 +350,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); + params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])}); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -385,7 +370,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -395,16 +380,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } types.push_back(gt); } - if (invalid_param) { - break; - } params.type_k.insert(params.type_k.end(), types.begin(), types.end()); } else if (arg == "-ctv" || arg == "--cache-type-v") { if (++i >= argc) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -414,9 +396,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } types.push_back(gt); } - if (invalid_param) { - break; - } params.type_v.insert(params.type_v.end(), types.begin(), types.end()); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { @@ -425,27 +404,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); - } else if (arg == "-C" || arg == "--cpu-mask") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end()); - } else if (arg == "--cpu-strict") { - if (++i >= argc) { - 
invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end()); - } else if (arg == "--poll") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.poll.insert(params.poll.end(), p.begin(), p.end()); } else if (arg == "-ngl" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; @@ -453,7 +411,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); - } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { + } else if (arg == "-rpc" || arg == "--rpc") { if (++i >= argc) { invalid_param = true; break; @@ -464,7 +422,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector modes; for (const auto & m : p) { llama_split_mode mode; @@ -480,9 +438,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } modes.push_back(mode); } - if (invalid_param) { - break; - } params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); } else if (arg == "-mg" || arg == "--main-gpu") { if (++i >= argc) { @@ -503,16 +458,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } else { std::string value(argv[i]); - /**/ if (value == "distribute" || value == "") { - params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; - } else if (value == "isolate") { - params.numa = GGML_NUMA_STRATEGY_ISOLATE; - } else if (value == "numactl") { - params.numa = GGML_NUMA_STRATEGY_NUMACTL; - } else { - invalid_param = true; - break; - } + /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { invalid_param = true; break; } } } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { @@ -542,9 +491,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } for (auto ts : string_split(argv[i], split_delim)) { // split string by ; and / - const std::regex regex{ R"([;/]+)" }; - std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 }; - std::vector split_arg{ it, {} }; + const std::regex regex{R"([;/]+)"}; + std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; + std::vector split_arg{it, {}}; GGML_ASSERT(split_arg.size() <= llama_max_devices()); std::vector tensor_split(llama_max_devices()); @@ -563,18 +512,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.reps = std::stoi(argv[i]); - } else if (arg == "--prio") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.prio = (enum ggml_sched_priority) std::stoi(argv[i]); - } else if (arg == "--delay") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.delay = std::stoi(argv[i]); } else if (arg == "-o" || arg == "--output") { if (++i >= argc) { invalid_param = true; @@ -589,8 +526,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); } else if (arg == "-v" || arg == "--verbose") { params.verbose = true; - } else if (arg == "--progress") { - params.progress = true; } else { invalid_param = true; break; 
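// A minimal, self-contained sketch of the comma-separated multi-value pattern
// that parse_cmd_params uses for most flags in the hunk above: every occurrence
// of a flag is split on ',' and appended to a vector, so "-t 4,8 -t 16" ends up
// as {4, 8, 16}, and the benchmark later runs the cross-product of all such
// vectors. The helper below is an illustrative stand-in, not the exact
// string_split helper from the project's common code.
#include <sstream>
#include <string>
#include <vector>

template <typename T>
static std::vector<T> split_values(const std::string & input, char delim) {
    std::vector<T> out;
    std::stringstream ss(input);
    std::string item;
    while (std::getline(ss, item, delim)) {
        T value{};
        std::stringstream(item) >> value;
        out.push_back(value);
    }
    return out;
}

// usage sketch: accumulate every value given for --threads
//   std::vector<int> n_threads;
//   auto p = split_values<int>("4,8", ',');
//   n_threads.insert(n_threads.end(), p.begin(), p.end());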
@@ -603,156 +538,83 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } // set defaults - if (params.model.empty()) { - params.model = cmd_params_defaults.model; - } - if (params.n_prompt.empty()) { - params.n_prompt = cmd_params_defaults.n_prompt; - } - if (params.n_gen.empty()) { - params.n_gen = cmd_params_defaults.n_gen; - } - if (params.n_pg.empty()) { - params.n_pg = cmd_params_defaults.n_pg; - } - if (params.n_batch.empty()) { - params.n_batch = cmd_params_defaults.n_batch; - } - if (params.n_ubatch.empty()) { - params.n_ubatch = cmd_params_defaults.n_ubatch; - } - if (params.type_k.empty()) { - params.type_k = cmd_params_defaults.type_k; - } - if (params.type_v.empty()) { - params.type_v = cmd_params_defaults.type_v; - } - if (params.n_gpu_layers.empty()) { - params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; - } - if (params.rpc_servers.empty()) { - params.rpc_servers = cmd_params_defaults.rpc_servers; - } - if (params.split_mode.empty()) { - params.split_mode = cmd_params_defaults.split_mode; - } - if (params.main_gpu.empty()) { - params.main_gpu = cmd_params_defaults.main_gpu; - } - if (params.no_kv_offload.empty()) { - params.no_kv_offload = cmd_params_defaults.no_kv_offload; - } - if (params.flash_attn.empty()) { - params.flash_attn = cmd_params_defaults.flash_attn; - } - if (params.tensor_split.empty()) { - params.tensor_split = cmd_params_defaults.tensor_split; - } - if (params.use_mmap.empty()) { - params.use_mmap = cmd_params_defaults.use_mmap; - } - if (params.embeddings.empty()) { - params.embeddings = cmd_params_defaults.embeddings; - } - if (params.n_threads.empty()) { - params.n_threads = cmd_params_defaults.n_threads; - } - if (params.cpu_mask.empty()) { - params.cpu_mask = cmd_params_defaults.cpu_mask; - } - if (params.cpu_strict.empty()) { - params.cpu_strict = cmd_params_defaults.cpu_strict; - } - if (params.poll.empty()) { - params.poll = cmd_params_defaults.poll; - } + if (params.model.empty()) { params.model = cmd_params_defaults.model; } + if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } + if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } + if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; } + if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } + if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } + if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } + if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } + if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } + if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } + if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } + if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } + if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } + if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; } + if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } + if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } + if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } + if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } return params; } struct cmd_params_instance { - std::string model; - int n_prompt; - int 
n_gen; - int n_batch; - int n_ubatch; - ggml_type type_k; - ggml_type type_v; - int n_threads; - std::string cpu_mask; - bool cpu_strict; - int poll; - int n_gpu_layers; - std::string rpc_servers_str; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; + std::string model; + int n_prompt; + int n_gen; + int n_batch; + int n_ubatch; + ggml_type type_k; + ggml_type type_v; + int n_threads; + int n_gpu_layers; + std::string rpc_servers; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; std::vector tensor_split; - bool use_mmap; - bool embeddings; + bool use_mmap; + bool embeddings; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = n_gpu_layers; - if (!rpc_servers_str.empty()) { - auto rpc_servers = string_split(rpc_servers_str, ','); - - // add RPC devices - if (!rpc_servers.empty()) { - ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); - if (!rpc_reg) { - fprintf(stderr, "%s: failed to find RPC backend\n", __func__); - exit(1); - } - - typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint); - ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device"); - if (!ggml_backend_rpc_add_device_fn) { - fprintf(stderr, "%s: failed to find RPC device add function\n", __func__); - exit(1); - } - static std::vector devices; - devices.clear(); - for (const std::string & server : rpc_servers) { - ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str()); - if (dev) { - devices.push_back(dev); - } else { - fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str()); - exit(1); - } - } - devices.push_back(nullptr); - mparams.devices = devices.data(); - } + if (!rpc_servers.empty()) { + mparams.rpc_servers = rpc_servers.c_str(); } - mparams.split_mode = split_mode; - mparams.main_gpu = main_gpu; + mparams.split_mode = split_mode; + mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); - mparams.use_mmap = use_mmap; + mparams.use_mmap = use_mmap; return mparams; } bool equal_mparams(const cmd_params_instance & other) const { - return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str && - split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && + return model == other.model && + n_gpu_layers == other.n_gpu_layers && + rpc_servers == other.rpc_servers && + split_mode == other.split_mode && + main_gpu == other.main_gpu && + use_mmap == other.use_mmap && tensor_split == other.tensor_split; } llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen; - cparams.n_batch = n_batch; - cparams.n_ubatch = n_ubatch; - cparams.type_k = type_k; - cparams.type_v = type_v; + cparams.n_ctx = n_prompt + n_gen; + cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; + cparams.type_k = type_k; + cparams.type_v = type_v; cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn = flash_attn; - cparams.embeddings = embeddings; + cparams.flash_attn = flash_attn; + cparams.embeddings = embeddings; return cparams; } @@ -762,7 +624,6 @@ static std::vector get_cmd_params_instances(const cmd_param std::vector instances; // this ordering minimizes the number of times that each model needs to be 
reloaded - // clang-format off for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) for (const auto & rpc : params.rpc_servers) @@ -777,10 +638,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & tv : params.type_v) for (const auto & nkvo : params.no_kv_offload) for (const auto & fa : params.flash_attn) - for (const auto & nt : params.n_threads) - for (const auto & cm : params.cpu_mask) - for (const auto & cs : params.cpu_strict) - for (const auto & pl : params.poll) { + for (const auto & nt : params.n_threads) { for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { continue; @@ -794,9 +652,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, - /* .cpu_mask = */ cm, - /* .cpu_strict = */ cs, - /* .poll = */ pl, /* .n_gpu_layers = */ nl, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, @@ -823,9 +678,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, - /* .cpu_mask = */ cm, - /* .cpu_strict = */ cs, - /* .poll = */ pl, /* .n_gpu_layers = */ nl, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, @@ -852,9 +704,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, - /* .cpu_mask = */ cm, - /* .cpu_strict = */ cs, - /* .poll = */ pl, /* .n_gpu_layers = */ nl, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, @@ -868,125 +717,157 @@ static std::vector get_cmd_params_instances(const cmd_param instances.push_back(instance); } } - // clang-format on return instances; } struct test { static const std::string build_commit; - static const int build_number; + static const int build_number; + static const bool cuda; + static const bool vulkan; + static const bool kompute; + static const bool metal; + static const bool sycl; + static const bool gpu_blas; + static const bool blas; static const std::string cpu_info; static const std::string gpu_info; - std::string model_filename; - std::string model_type; - uint64_t model_size; - uint64_t model_n_params; - int n_batch; - int n_ubatch; - int n_threads; - std::string cpu_mask; - bool cpu_strict; - int poll; - ggml_type type_k; - ggml_type type_v; - int n_gpu_layers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; - std::vector tensor_split; - bool use_mmap; - bool embeddings; - int n_prompt; - int n_gen; - std::string test_time; - std::vector samples_ns; + std::string model_filename; + std::string model_type; + uint64_t model_size; + uint64_t model_n_params; + int n_batch; + int n_ubatch; + int n_threads; + bool has_rpc; + ggml_type type_k; + ggml_type type_v; + int n_gpu_layers; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; + std::vector tensor_split; + bool use_mmap; + bool embeddings; + int n_prompt; + int n_gen; + std::string test_time; + std::vector samples_ns; test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { model_filename = inst.model; char buf[128]; llama_model_desc(lmodel, buf, sizeof(buf)); - model_type = buf; - model_size = llama_model_size(lmodel); + model_type = buf; + model_size = llama_model_size(lmodel); model_n_params = llama_model_n_params(lmodel); - n_batch = inst.n_batch; - n_ubatch = inst.n_ubatch; - n_threads = inst.n_threads; - cpu_mask = inst.cpu_mask; - cpu_strict = inst.cpu_strict; - poll = inst.poll; - type_k = 
inst.type_k; - type_v = inst.type_v; - n_gpu_layers = inst.n_gpu_layers; - split_mode = inst.split_mode; - main_gpu = inst.main_gpu; - no_kv_offload = inst.no_kv_offload; - flash_attn = inst.flash_attn; - tensor_split = inst.tensor_split; - use_mmap = inst.use_mmap; - embeddings = inst.embeddings; - n_prompt = inst.n_prompt; - n_gen = inst.n_gen; + n_batch = inst.n_batch; + n_ubatch = inst.n_ubatch; + n_threads = inst.n_threads; + has_rpc = !inst.rpc_servers.empty(); + type_k = inst.type_k; + type_v = inst.type_v; + n_gpu_layers = inst.n_gpu_layers; + split_mode = inst.split_mode; + main_gpu = inst.main_gpu; + no_kv_offload = inst.no_kv_offload; + flash_attn = inst.flash_attn; + tensor_split = inst.tensor_split; + use_mmap = inst.use_mmap; + embeddings = inst.embeddings; + n_prompt = inst.n_prompt; + n_gen = inst.n_gen; // RFC 3339 date-time format - time_t t = time(NULL); + time_t t = time(NULL); std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); test_time = buf; (void) ctx; } - uint64_t avg_ns() const { return ::avg(samples_ns); } + uint64_t avg_ns() const { + return ::avg(samples_ns); + } - uint64_t stdev_ns() const { return ::stdev(samples_ns); } + uint64_t stdev_ns() const { + return ::stdev(samples_ns); + } std::vector get_ts() const { - int n_tokens = n_prompt + n_gen; + int n_tokens = n_prompt + n_gen; std::vector ts; - std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), - [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); + std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); return ts; } - double avg_ts() const { return ::avg(get_ts()); } + double avg_ts() const { + return ::avg(get_ts()); + } - double stdev_ts() const { return ::stdev(get_ts()); } + double stdev_ts() const { + return ::stdev(get_ts()); + } static std::string get_backend() { - std::vector backends; - for (size_t i = 0; i < ggml_backend_reg_count(); i++) { - auto * reg = ggml_backend_reg_get(i); - std::string name = ggml_backend_reg_name(reg); - if (name != "CPU") { - backends.push_back(ggml_backend_reg_name(reg)); - } + if (cuda) { + return GGML_CUDA_NAME; } - return backends.empty() ? 
"CPU" : join(backends, ","); + if (vulkan) { + return "Vulkan"; + } + if (kompute) { + return "Kompute"; + } + if (metal) { + return "Metal"; + } + if (sycl) { + return GGML_SYCL_NAME; + } + if (gpu_blas) { + return "GPU BLAS"; + } + if (blas) { + return "BLAS"; + } + + return "CPU"; } static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", - "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", - "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts", + "build_commit", "build_number", + "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas", + "cpu_info", "gpu_info", + "model_filename", "model_type", "model_size", "model_n_params", + "n_batch", "n_ubatch", + "n_threads", "type_k", "type_v", + "n_gpu_layers", "split_mode", + "main_gpu", "no_kv_offload", "flash_attn", + "tensor_split", "use_mmap", "embeddings", + "n_prompt", "n_gen", "test_time", + "avg_ns", "stddev_ns", + "avg_ts", "stddev_ts" }; return fields; } - enum field_type { STRING, BOOL, INT, FLOAT }; + enum field_type {STRING, BOOL, INT, FLOAT}; static field_type get_field_type(const std::string & field) { - if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || - field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || - field == "stddev_ns") { + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || + field == "n_threads" || + field == "model_size" || field == "model_n_params" || + field == "n_gpu_layers" || field == "main_gpu" || + field == "n_prompt" || field == "n_gen" || + field == "avg_ns" || field == "stddev_ns") { return INT; } - if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || - field == "use_mmap" || field == "embeddings") { + if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" || + field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || + field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -997,7 +878,7 @@ struct test { std::vector get_values() const { std::string tensor_split_str; - int max_nonzero = 0; + int max_nonzero = 0; for (size_t i = 0; i < llama_max_devices(); i++) { if (tensor_split[i] > 0) { max_nonzero = i; @@ -1011,53 +892,43 @@ struct test { tensor_split_str += "/"; } } - std::vector values = { build_commit, - std::to_string(build_number), - cpu_info, - gpu_info, - get_backend(), - model_filename, - model_type, - std::to_string(model_size), - std::to_string(model_n_params), - std::to_string(n_batch), - std::to_string(n_ubatch), - std::to_string(n_threads), - cpu_mask, - std::to_string(cpu_strict), - std::to_string(poll), - ggml_type_name(type_k), - ggml_type_name(type_v), - std::to_string(n_gpu_layers), - split_mode_str(split_mode), - std::to_string(main_gpu), - std::to_string(no_kv_offload), - std::to_string(flash_attn), - tensor_split_str, - std::to_string(use_mmap), - std::to_string(embeddings), - std::to_string(n_prompt), - 
std::to_string(n_gen), - test_time, - std::to_string(avg_ns()), - std::to_string(stdev_ns()), - std::to_string(avg_ts()), - std::to_string(stdev_ts()) }; + std::vector values = { + build_commit, std::to_string(build_number), + std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan), + std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas), + cpu_info, gpu_info, + model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), + std::to_string(n_batch), std::to_string(n_ubatch), + std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), + std::to_string(n_gpu_layers), split_mode_str(split_mode), + std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), + tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), + std::to_string(n_prompt), std::to_string(n_gen), test_time, + std::to_string(avg_ns()), std::to_string(stdev_ns()), + std::to_string(avg_ts()), std::to_string(stdev_ts()) + }; return values; } std::map get_map() const { std::map map; - auto fields = get_fields(); - auto values = get_values(); - std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()), - std::make_pair); + auto fields = get_fields(); + auto values = get_values(); + std::transform(fields.begin(), fields.end(), values.begin(), + std::inserter(map, map.end()), std::make_pair); return map; } }; const std::string test::build_commit = LLAMA_COMMIT; const int test::build_number = LLAMA_BUILD_NUMBER; +const bool test::cuda = !!ggml_cpu_has_cuda(); +const bool test::vulkan = !!ggml_cpu_has_vulkan(); +const bool test::kompute = !!ggml_cpu_has_kompute(); +const bool test::metal = !!ggml_cpu_has_metal(); +const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); +const bool test::blas = !!ggml_cpu_has_blas(); +const bool test::sycl = !!ggml_cpu_has_sycl(); const std::string test::cpu_info = get_cpu_info(); const std::string test::gpu_info = get_gpu_info(); @@ -1065,12 +936,9 @@ struct printer { virtual ~printer() {} FILE * fout; - virtual void print_header(const cmd_params & params) { (void) params; } - virtual void print_test(const test & t) = 0; - - virtual void print_footer() {} + virtual void print_footer() { } }; struct csv_printer : public printer { @@ -1086,7 +954,7 @@ struct csv_printer : public printer { return escaped; } - void print_header(const cmd_params & params) override { + void print_header(const cmd_params & params) override { std::vector fields = test::get_fields(); fprintf(fout, "%s\n", join(fields, ",").c_str()); (void) params; @@ -1099,38 +967,38 @@ struct csv_printer : public printer { } }; -static std::string escape_json(const std::string & value) { - std::string escaped; - for (auto c : value) { - if (c == '"') { - escaped += "\\\""; - } else if (c == '\\') { - escaped += "\\\\"; - } else if (c <= 0x1f) { - char buf[8]; - snprintf(buf, sizeof(buf), "\\u%04x", c); - escaped += buf; - } else { - escaped += c; - } - } - return escaped; -} - -static std::string format_json_value(const std::string & field, const std::string & value) { - switch (test::get_field_type(field)) { - case test::STRING: - return "\"" + escape_json(value) + "\""; - case test::BOOL: - return value == "0" ? 
"false" : "true"; - default: - return value; - } -} - struct json_printer : public printer { bool first = true; + static std::string escape_json(const std::string & value) { + std::string escaped; + for (auto c : value) { + if (c == '"') { + escaped += "\\\""; + } else if (c == '\\') { + escaped += "\\\\"; + } else if (c <= 0x1f) { + char buf[8]; + snprintf(buf, sizeof(buf), "\\u%04x", c); + escaped += buf; + } else { + escaped += c; + } + } + return escaped; + } + + static std::string format_value(const std::string & field, const std::string & value) { + switch (test::get_field_type(field)) { + case test::STRING: + return "\"" + escape_json(value) + "\""; + case test::BOOL: + return value == "0" ? "false" : "true"; + default: + return value; + } + } + void print_header(const cmd_params & params) override { fprintf(fout, "[\n"); (void) params; @@ -1139,8 +1007,7 @@ struct json_printer : public printer { void print_fields(const std::vector & fields, const std::vector & values) { assert(fields.size() == values.size()); for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), - format_json_value(fields.at(i), values.at(i)).c_str()); + fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str()); } } @@ -1158,24 +1025,8 @@ struct json_printer : public printer { fflush(fout); } - void print_footer() override { fprintf(fout, "\n]\n"); } -}; - -struct jsonl_printer : public printer { - void print_fields(const std::vector & fields, const std::vector & values) { - assert(fields.size() == values.size()); - for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str()); - } - } - - void print_test(const test & t) override { - fprintf(fout, "{"); - print_fields(test::get_fields(), t.get_values()); - fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str()); - fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str()); - fprintf(fout, "}\n"); - fflush(fout); + void print_footer() override { + fprintf(fout, "\n]\n"); } }; @@ -1187,7 +1038,7 @@ struct markdown_printer : public printer { return -30; } if (field == "t/s") { - return 20; + return 16; } if (field == "size" || field == "params") { return 10; @@ -1220,7 +1071,7 @@ struct markdown_printer : public printer { return 13; } - int width = std::max((int) field.length(), 10); + int width = std::max((int)field.length(), 10); if (test::get_field_type(field) == test::STRING) { return -width; @@ -1262,23 +1113,13 @@ struct markdown_printer : public printer { fields.emplace_back("size"); fields.emplace_back("params"); fields.emplace_back("backend"); - bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos || - test::get_backend().find("BLAS") != std::string::npos; + bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; if (!is_cpu_backend) { fields.emplace_back("n_gpu_layers"); } if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { fields.emplace_back("n_threads"); } - if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) { - fields.emplace_back("cpu_mask"); - } - if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) { - fields.emplace_back("cpu_strict"); - } - if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) { - fields.emplace_back("poll"); - } if 
(params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { fields.emplace_back("n_batch"); } @@ -1334,18 +1175,18 @@ struct markdown_printer : public printer { fprintf(fout, "|"); for (const auto & field : fields) { std::string value; - char buf[128]; + char buf[128]; if (field == "model") { value = t.model_type; } else if (field == "size") { - if (t.model_size < 1024 * 1024 * 1024) { + if (t.model_size < 1024*1024*1024) { snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0); } else { snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0); } value = buf; } else if (field == "params") { - if (t.model_n_params < 1000 * 1000 * 1000) { + if (t.model_n_params < 1000*1000*1000) { snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6); } else { snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9); @@ -1353,6 +1194,9 @@ struct markdown_printer : public printer { value = buf; } else if (field == "backend") { value = test::get_backend(); + if (t.has_rpc) { + value += "+RPC"; + } } else if (field == "test") { if (t.n_prompt > 0 && t.n_gen == 0) { snprintf(buf, sizeof(buf), "pp%d", t.n_prompt); @@ -1407,8 +1251,7 @@ struct sql_printer : public printer { std::vector fields = test::get_fields(); fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n"); for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), - i < fields.size() - 1 ? "," : ""); + fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : ""); } fprintf(fout, ");\n"); fprintf(fout, "\n"); @@ -1426,12 +1269,11 @@ struct sql_printer : public printer { } }; -static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { +static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - const int32_t n_vocab = llama_vocab_n_tokens(vocab); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); std::vector tokens(n_batch); @@ -1439,28 +1281,27 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); - tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab; + tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; for (int i = 1; i < n_tokens; i++) { tokens[i] = std::rand() % n_vocab; } - llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens)); + llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); n_processed += n_tokens; } llama_synchronize(ctx); } -static void test_gen(llama_context * ctx, int n_gen, int n_threads) { +static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - const int32_t n_vocab = llama_vocab_n_tokens(vocab); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); - llama_token token = llama_vocab_get_add_bos(vocab) ? 
llama_vocab_bos(vocab) : std::rand() % n_vocab; + llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; for (int i = 0; i < n_gen; i++) { - llama_decode(ctx, llama_batch_get_one(&token, 1)); + llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); llama_synchronize(ctx); token = std::rand() % n_vocab; } @@ -1480,14 +1321,12 @@ static std::unique_ptr create_printer(output_formats format) { return std::unique_ptr(new csv_printer()); case JSON: return std::unique_ptr(new json_printer()); - case JSONL: - return std::unique_ptr(new jsonl_printer()); case MARKDOWN: return std::unique_ptr(new markdown_printer()); case SQL: return std::unique_ptr(new sql_printer()); } - GGML_ABORT("fatal error"); + GGML_ASSERT(false); } int main(int argc, char ** argv) { @@ -1508,17 +1347,6 @@ int main(int argc, char ** argv) { cmd_params params = parse_cmd_params(argc, argv); - // initialize backends - ggml_backend_load_all(); - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (!cpu_dev) { - fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); - return 1; - } - auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); - auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new"); - auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free"); - // initialize llama.cpp if (!params.verbose) { llama_log_set(llama_null_log_callback, NULL); @@ -1526,10 +1354,8 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - set_process_priority(params.prio); - // initialize printer - std::unique_ptr p = create_printer(params.output_format); + std::unique_ptr p = create_printer(params.output_format); std::unique_ptr p_err = create_printer(params.output_format_stderr); if (p) { @@ -1544,23 +1370,17 @@ int main(int argc, char ** argv) { std::vector params_instances = get_cmd_params_instances(params); - llama_model * lmodel = nullptr; + llama_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; - int params_idx = 0; - auto params_count = params_instances.size(); for (const auto & inst : params_instances) { - params_idx++; - if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count); - } // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (lmodel) { - llama_model_free(lmodel); + llama_free_model(lmodel); } - lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams()); + lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams()); if (lmodel == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); return 1; @@ -1568,10 +1388,10 @@ int main(int argc, char ** argv) { prev_inst = &inst; } - llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams()); + llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams()); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); - llama_model_free(lmodel); + llama_free_model(lmodel); return 1; } @@ -1579,41 +1399,13 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); - // cool off before the test - if (params.delay) { - std::this_thread::sleep_for(std::chrono::seconds(params.delay)); 
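// A rough sketch of how the nanosecond samples collected by the repetition
// loop below become the reported tokens/second numbers, mirroring the
// avg()/stdev()/get_ts() logic earlier in this file: each sample is converted
// with 1e9 * n_tokens / t_ns and summarized as mean +/- sample standard
// deviation. Helper names here are illustrative, not the actual llama-bench
// functions.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

static double mean_of(const std::vector<double> & v) {
    double sum = 0.0;
    for (double x : v) { sum += x; }
    return v.empty() ? 0.0 : sum / v.size();
}

static double stdev_of(const std::vector<double> & v) {
    if (v.size() < 2) { return 0.0; }
    const double m = mean_of(v);
    double acc = 0.0;
    for (double x : v) { acc += (x - m) * (x - m); }
    return std::sqrt(acc / (v.size() - 1));
}

static void report_ts(const std::vector<uint64_t> & samples_ns, int n_tokens) {
    std::vector<double> ts;
    for (uint64_t t_ns : samples_ns) {
        ts.push_back(1e9 * n_tokens / t_ns);
    }
    std::printf("%.2f +/- %.2f t/s\n", mean_of(ts), stdev_of(ts));
}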
- } - - struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads); - if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) { - fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str()); - exit(1); - } - tpp.strict_cpu = t.cpu_strict; - tpp.poll = t.poll; - tpp.prio = params.prio; - - struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); - if (!threadpool) { - fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - exit(1); - } - - llama_attach_threadpool(ctx, threadpool, NULL); - // warmup run if (t.n_prompt > 0) { - if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count); - } //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); - test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); + test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); } if (t.n_gen > 0) { - if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count); - } - test_gen(ctx, 1, t.n_threads); + test_gen(ctx, 1, 0, t.n_threads); } for (int i = 0; i < params.reps; i++) { @@ -1622,18 +1414,10 @@ int main(int argc, char ** argv) { uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { - if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count, - i + 1, params.reps); - } - test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); + test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); } if (t.n_gen > 0) { - if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, - i + 1, params.reps); - } - test_gen(ctx, t.n_gen, t.n_threads); + test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads); } uint64_t t_ns = get_time_ns() - t_start; @@ -1650,14 +1434,12 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_context_print(ctx); + llama_print_timings(ctx); llama_free(ctx); - - ggml_threadpool_free_fn(threadpool); } - llama_model_free(lmodel); + llama_free_model(lmodel); if (p) { p->print_footer(); diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts index 28dbc1904..0a3806172 100644 --- a/examples/llama.android/llama/build.gradle.kts +++ b/examples/llama.android/llama/build.gradle.kts @@ -18,8 +18,6 @@ android { } externalNativeBuild { cmake { - arguments += "-DLLAMA_BUILD_COMMON=ON" - arguments += "-DGGML_LLAMAFILE=OFF" arguments += "-DCMAKE_BUILD_TYPE=Release" cppFlags += listOf() arguments += listOf() diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 2a73983a9..2aafe2316 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -87,7 +87,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi auto path_to_model = env->GetStringUTFChars(filename, 0); LOGi("Loading model from %s", path_to_model); - auto model = llama_model_load_from_file(path_to_model, model_params); + auto model = llama_load_model_from_file(path_to_model, model_params); env->ReleaseStringUTFChars(filename, path_to_model); if (!model) { @@ -102,7 +102,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi extern "C" JNIEXPORT void JNICALL 
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) { - llama_model_free(reinterpret_cast(model)); + llama_free_model(reinterpret_cast(model)); } extern "C" @@ -120,8 +120,8 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo LOGi("Using %d threads", n_threads); llama_context_params ctx_params = llama_context_default_params(); - - ctx_params.n_ctx = 2048; + ctx_params.seed = 1234; + ctx_params.n_ctx = 2048; ctx_params.n_threads = n_threads; ctx_params.n_threads_batch = n_threads; @@ -186,11 +186,11 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( for (nri = 0; nri < nr; nri++) { LOGi("Benchmark prompt processing (pp)"); - common_batch_clear(*batch); + llama_batch_clear(*batch); const int n_tokens = pp; for (i = 0; i < n_tokens; i++) { - common_batch_add(*batch, 0, i, { 0 }, false); + llama_batch_add(*batch, 0, i, { 0 }, false); } batch->logits[batch->n_tokens - 1] = true; @@ -210,9 +210,9 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( const auto t_tg_start = ggml_time_us(); for (i = 0; i < tg; i++) { - common_batch_clear(*batch); + llama_batch_clear(*batch); for (j = 0; j < pl; j++) { - common_batch_add(*batch, 0, i, { j }, true); + llama_batch_add(*batch, 0, i, { j }, true); } LOGi("llama_decode() text generation: %d", i); @@ -269,6 +269,12 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( return env->NewStringUTF(result.str().c_str()); } +extern "C" +JNIEXPORT void JNICALL +Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { + llama_batch_free(*reinterpret_cast(batch_pointer)); +} + extern "C" JNIEXPORT jlong JNICALL Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { @@ -283,6 +289,9 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, nullptr, nullptr, nullptr, + 0, + 0, + 0, }; if (embd) { @@ -302,31 +311,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, return reinterpret_cast(batch); } -extern "C" -JNIEXPORT void JNICALL -Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { - //llama_batch_free(*reinterpret_cast(batch_pointer)); - const auto batch = reinterpret_cast(batch_pointer); - delete batch; -} - -extern "C" -JNIEXPORT jlong JNICALL -Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) { - auto sparams = llama_sampler_chain_default_params(); - sparams.no_perf = true; - llama_sampler * smpl = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); - - return reinterpret_cast(smpl); -} - -extern "C" -JNIEXPORT void JNICALL -Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) { - llama_sampler_free(reinterpret_cast(sampler_pointer)); -} - extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) { @@ -347,7 +331,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( jlong context_pointer, jlong batch_pointer, jstring jtext, - jboolean format_chat, jint n_len ) { @@ -357,8 +340,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( const auto context = reinterpret_cast(context_pointer); const auto batch = reinterpret_cast(batch_pointer); - bool parse_special = (format_chat == JNI_TRUE); - const auto tokens_list = common_tokenize(context, text, true, parse_special); + const auto tokens_list = llama_tokenize(context, text, 1); auto n_ctx = llama_n_ctx(context); auto 
n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); @@ -370,14 +352,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( } for (auto id : tokens_list) { - LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id); + LOGi("%s", llama_token_to_piece(context, id).c_str()); } - common_batch_clear(*batch); + llama_batch_clear(*batch); // evaluate the initial prompt for (auto i = 0; i < tokens_list.size(); i++) { - common_batch_add(*batch, tokens_list[i], i, { 0 }, false); + llama_batch_add(*batch, tokens_list[i], i, { 0 }, false); } // llama_decode will output logits only for the last token of the prompt @@ -399,29 +381,38 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( jobject, jlong context_pointer, jlong batch_pointer, - jlong sampler_pointer, jint n_len, jobject intvar_ncur ) { const auto context = reinterpret_cast(context_pointer); - const auto batch = reinterpret_cast(batch_pointer); - const auto sampler = reinterpret_cast(sampler_pointer); + const auto batch = reinterpret_cast(batch_pointer); const auto model = llama_get_model(context); - const auto vocab = llama_model_get_vocab(model); if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur); if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I"); if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V"); + auto n_vocab = llama_n_vocab(model); + auto logits = llama_get_logits_ith(context, batch->n_tokens - 1); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + // sample the most likely token - const auto new_token_id = llama_sampler_sample(sampler, context, -1); + const auto new_token_id = llama_sample_token_greedy(context, &candidates_p); const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); - if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) { + if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { return nullptr; } - auto new_token_chars = common_token_to_piece(context, new_token_id); + auto new_token_chars = llama_token_to_piece(context, new_token_id); cached_token_chars += new_token_chars; jstring new_token = nullptr; @@ -433,8 +424,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( new_token = env->NewStringUTF(""); } - common_batch_clear(*batch); - common_batch_add(*batch, new_token_id, n_cur, { 0 }, true); + llama_batch_clear(*batch); + llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true); env->CallVoidMethod(intvar_ncur, la_int_var_inc); diff --git a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt index b964d93e3..6c63e54e0 100644 --- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt +++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt @@ -45,10 +45,8 @@ class LLamaAndroid { private external fun free_context(context: Long) private external fun backend_init(numa: Boolean) private external fun backend_free() - private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long private external fun free_batch(batch: Long) - private external fun new_sampler(): Long - private external fun free_sampler(sampler: Long) + private external fun 
new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long private external fun bench_model( context: Long, model: Long, @@ -65,14 +63,12 @@ class LLamaAndroid { context: Long, batch: Long, text: String, - formatChat: Boolean, nLen: Int ): Int private external fun completion_loop( context: Long, batch: Long, - sampler: Long, nLen: Int, ncur: IntVar ): String? @@ -105,23 +101,20 @@ class LLamaAndroid { val batch = new_batch(512, 0, 1) if (batch == 0L) throw IllegalStateException("new_batch() failed") - val sampler = new_sampler() - if (sampler == 0L) throw IllegalStateException("new_sampler() failed") - Log.i(tag, "Loaded model $pathToModel") - threadLocalState.set(State.Loaded(model, context, batch, sampler)) + threadLocalState.set(State.Loaded(model, context, batch)) } else -> throw IllegalStateException("Model already loaded") } } } - fun send(message: String, formatChat: Boolean = false): Flow = flow { + fun send(message: String): Flow = flow { when (val state = threadLocalState.get()) { is State.Loaded -> { - val ncur = IntVar(completion_init(state.context, state.batch, message, formatChat, nlen)) + val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) while (ncur.value <= nlen) { - val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur) + val str = completion_loop(state.context, state.batch, nlen, ncur) if (str == null) { break } @@ -145,7 +138,6 @@ class LLamaAndroid { free_context(state.context) free_model(state.model) free_batch(state.batch) - free_sampler(state.sampler); threadLocalState.set(State.Idle) } @@ -169,7 +161,7 @@ class LLamaAndroid { private sealed interface State { data object Idle: State - data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State + data class Loaded(val model: Long, val context: Long, val batch: Long): State } // Enforce only one instance of Llm. 
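A short aside on the sampling change shared by both mobile examples: the reverted JNI code above and the Swift code that follows build a llama_token_data array over the whole vocabulary and call llama_sample_token_greedy, which amounts to an argmax over the raw logits. A minimal standalone sketch of that idea, with illustrative names and no llama.cpp types:

#include <cstddef>

// return the index of the largest logit; under greedy sampling this index is
// the next token id
static int greedy_argmax(const float * logits, int n_vocab) {
    int best = 0;
    for (int id = 1; id < n_vocab; id++) {
        if (logits[id] > logits[best]) {
            best = id;
        }
    }
    return best;
}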
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index ee7141a66..58c32ca53 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -24,8 +24,6 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama actor LlamaContext { private var model: OpaquePointer private var context: OpaquePointer - private var vocab: OpaquePointer - private var sampling: UnsafeMutablePointer private var batch: llama_batch private var tokens_list: [llama_token] var is_done: Bool = false @@ -44,18 +42,12 @@ actor LlamaContext { self.tokens_list = [] self.batch = llama_batch_init(512, 0, 1) self.temporary_invalid_cchars = [] - let sparams = llama_sampler_chain_default_params() - self.sampling = llama_sampler_chain_init(sparams) - llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4)) - llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234)) - vocab = llama_model_get_vocab(model) } deinit { - llama_sampler_free(sampling) llama_batch_free(batch) - llama_model_free(model) llama_free(context) + llama_free_model(model) llama_backend_free() } @@ -67,7 +59,7 @@ actor LlamaContext { model_params.n_gpu_layers = 0 print("Running on simulator, force use n_gpu_layers = 0") #endif - let model = llama_model_load_from_file(path, model_params) + let model = llama_load_model_from_file(path, model_params) guard let model else { print("Could not load model at \(path)") throw LlamaError.couldNotInitializeContext @@ -77,11 +69,12 @@ actor LlamaContext { print("Using \(n_threads) threads") var ctx_params = llama_context_default_params() + ctx_params.seed = 1234 ctx_params.n_ctx = 2048 - ctx_params.n_threads = Int32(n_threads) - ctx_params.n_threads_batch = Int32(n_threads) + ctx_params.n_threads = UInt32(n_threads) + ctx_params.n_threads_batch = UInt32(n_threads) - let context = llama_init_from_model(model, ctx_params) + let context = llama_new_context_with_model(model, ctx_params) guard let context else { print("Could not load context!") throw LlamaError.couldNotInitializeContext @@ -151,9 +144,22 @@ actor LlamaContext { func completion_loop() -> String { var new_token_id: llama_token = 0 - new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1) + let n_vocab = llama_n_vocab(model) + let logits = llama_get_logits_ith(context, batch.n_tokens - 1) - if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len { + var candidates = Array() + candidates.reserveCapacity(Int(n_vocab)) + + for token_id in 0...allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false) + let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false) var swiftTokens: [llama_token] = [] for i in 0...allocate(capacity: Int(-nTokens)) @@ -326,7 +332,7 @@ actor LlamaContext { defer { newResult.deallocate() } - let nNewTokens = llama_token_to_piece(vocab, token, newResult, -nTokens, 0, false) + let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false) let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens)) return Array(bufferPointer) } else { diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj index ff3d108b2..3950b9e9d 100644 --- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj +++ 
b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj @@ -7,7 +7,6 @@ objects = { /* Begin PBXBuildFile section */ - 1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; }; 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; 79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; }; 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; }; @@ -18,6 +17,7 @@ 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; + DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; }; F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; }; /* End PBXBuildFile section */ @@ -42,7 +42,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 1809696D2D05A39F00400EE8 /* llama in Frameworks */, + DF810E132B4A5BA200301144 /* llama in Frameworks */, 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, ); @@ -151,7 +151,7 @@ ); name = llama.swiftui; packageProductDependencies = ( - 1809696C2D05A39F00400EE8 /* llama */, + DF810E122B4A5BA200301144 /* llama */, ); productName = llama.swiftui; productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; @@ -429,7 +429,7 @@ /* End XCConfigurationList section */ /* Begin XCSwiftPackageProductDependency section */ - 1809696C2D05A39F00400EE8 /* llama */ = { + DF810E122B4A5BA200301144 /* llama */ = { isa = XCSwiftPackageProductDependency; productName = llama; }; diff --git a/examples/llama.vim b/examples/llama.vim index 57eb2a977..1b5ad6ba0 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -1,783 +1,135 @@ -" LLM-based text completion using llama.cpp +" Requires an already running llama.cpp server +" To install either copy or symlink to ~/.vim/autoload/llama.vim +" Then start with either :call llama#doLlamaGen(), +" or add a keybind to your vimrc such as +" nnoremap Z :call llama#doLlamaGen() +" Similarly, you could add an insert mode keybind with +" inoremap call llama#doLlamaGen() " -" requires: -" -" - neovim or vim -" - curl -" - llama.cpp server instance -" - FIM-compatible model -" -" sample config: -" -" - Tab - accept the current suggestion -" - Shift+Tab - accept just the first line of the suggestion -" - Ctrl+F - toggle FIM completion manually -" -" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim -" -" start the llama.cpp server with a FIM-compatible model. 
for example: -" -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256 -" -" --batch-size [512, model max context] -" -" adjust the batch size to control how much of the provided local context will be used during the inference -" lower values will use smaller part of the context around the cursor, which will result in faster processing -" -" --ubatch-size [64, 2048] -" -" chunks the batch into smaller chunks for faster processing -" depends on the specific hardware. use llama-bench to profile and determine the best size -" -" --cache-reuse (ge:llama_config.n_predict, 1024] -" -" this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict -" using non-zero value enables context reuse on the server side which dramatically improves the performance at -" large contexts. a value of 256 should be good for all cases -" -" run this once to initialise llama.vim: -" -" :call llama#init() -" -" more info: https://github.com/ggerganov/llama.cpp/pull/9787 +" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc +" let g:llama_api_url = "192.168.1.10:8080" +" llama_overrides can also be set through buffer/window scopes. For instance +" autocmd filetype python let b:llama_overrides = {"temp": 0.2} +" Could be added to your .vimrc to automatically set a lower temperature when +" editing a python script +" Additionally, an override dict can be stored at the top of a file +" !*{"stop": ["User:"]} +" Could be added to the start of your chatlog.txt to set the stopping token +" These parameter dicts are merged together from lowest to highest priority: +" server default -> g:llama_overrides -> w:llama_overrides -> +" b:llama_overrides -> in file (!*) overrides " +" Sublists (like logit_bias and stop) are overridden, not merged +" Example override: +" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647} +if !exists("g:llama_api_url") + let g:llama_api_url= "127.0.0.1:8080" +endif +if !exists("g:llama_overrides") + let g:llama_overrides = {} +endif +const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true } +const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. 
'/completion', '--header', "Content-Type: application/json"] +let s:linedict = {} -" colors (adjust to your liking) -highlight llama_hl_hint guifg=#ff772f ctermfg=202 -highlight llama_hl_info guifg=#77ff2f ctermfg=119 - -" general parameters: -" -" endpoint: llama.cpp server endpoint -" n_prefix: number of lines before the cursor location to include in the local prefix -" n_suffix: number of lines after the cursor location to include in the local suffix -" n_predict: max number of tokens to predict -" t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported) -" t_max_predict_ms: max alloted time for the prediction -" show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) -" auto_fim: trigger FIM completion automatically on cursor movement -" max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor -" -" ring buffer of chunks, accumulated with time upon: -" -" - completion request -" - yank -" - entering a buffer -" - leaving a buffer -" - writing a file -" -" parameters for the ring-buffer with extra context: -" -" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable) -" ring_chunk_size: max size of the chunks (in number of lines) -" note: adjust these numbers so that you don't overrun your context -" at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context -" ring_scope: the range around the cursor position (in number of lines) for gathering chunks after FIM -" ring_update_ms: how often to process queued chunks in normal mode -" -let s:default_config = { - \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 256, - \ 'n_suffix': 64, - \ 'n_predict': 128, - \ 't_max_prompt_ms': 500, - \ 't_max_predict_ms': 3000, - \ 'show_info': 2, - \ 'auto_fim': v:true, - \ 'max_line_suffix': 8, - \ 'ring_n_chunks': 64, - \ 'ring_chunk_size': 64, - \ 'ring_scope': 1024, - \ 'ring_update_ms': 1000, - \ } - -let g:llama_config = get(g:, 'llama_config', s:default_config) - -function! s:get_indent(str) - let l:count = 0 - for i in range(len(a:str)) - if a:str[i] == "\t" - let l:count += &tabstop - 1 - else - break - endif - endfor - return l:count +func s:callbackHandler(bufn, channel, msg) + if len(a:msg) < 3 + return + elseif a:msg[0] == "d" + let l:msg = a:msg[6:-1] + else + let l:msg = a:msg + endif + let l:decoded_msg = json_decode(l:msg) + let l:newtext = split(l:decoded_msg['content'], "\n", 1) + if len(l:newtext) > 0 + call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0]) + else + echo "nothing genned" + endif + if len(newtext) > 1 + let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1]) + let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1 + endif + if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop + echo "Finished generation" + endif endfunction -function! 
s:rand(i0, i1) abort - return a:i0 + rand() % (a:i1 - a:i0 + 1) +func llama#doLlamaGen() + if exists("b:job") + if job_status(b:job) == "run" + call job_stop(b:job) + return + endif + endif + + let l:cbuffer = bufnr("%") + let s:linedict[l:cbuffer] = line('$') + let l:buflines = getbufline(l:cbuffer, 1, 1000) + let l:querydata = copy(s:querydata) + call extend(l:querydata, g:llama_overrides) + if exists("w:llama_overrides") + call extend(l:querydata, w:llama_overrides) + endif + if exists("b:llama_overrides") + call extend(l:querydata, b:llama_overrides) + endif + if l:buflines[0][0:1] == '!*' + let l:userdata = json_decode(l:buflines[0][2:-1]) + call extend(l:querydata, l:userdata) + let l:buflines = l:buflines[1:-1] + endif + let l:querydata.prompt = join(l:buflines, "\n") + let l:curlcommand = copy(s:curlcommand) + if exists("g:llama_api_key") + call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key]) + endif + let l:curlcommand[2] = json_encode(l:querydata) + let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])}) endfunction -function! llama#init() - if !executable('curl') - echohl WarningMsg - echo 'llama.vim requires the "curl" command to be available' - echohl None - return - endif - - let s:pos_x = 0 " cursor position upon start of completion - let s:pos_y = 0 - - let s:line_cur = '' - - let s:line_cur_prefix = '' - let s:line_cur_suffix = '' - - let s:ring_chunks = [] " current set of chunks used as extra context - let s:ring_queued = [] " chunks that are queued to be sent for processing - let s:ring_n_evict = 0 - - let s:hint_shown = v:false - let s:pos_y_pick = -9999 " last y where we picked a chunk - let s:pos_dx = 0 - let s:content = [] - let s:can_accept = v:false - - let s:timer_fim = -1 - let s:t_fim_start = reltime() " used to measure total FIM time - let s:t_last_move = reltime() " last time the cursor moved - - let s:current_job = v:null - - let s:ghost_text_nvim = exists('*nvim_buf_get_mark') - let s:ghost_text_vim = has('textprop') - - if s:ghost_text_vim - let s:hlgroup_hint = 'llama_hl_hint' - let s:hlgroup_info = 'llama_hl_info' - - if empty(prop_type_get(s:hlgroup_hint)) - call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint}) - endif - if empty(prop_type_get(s:hlgroup_info)) - call prop_type_add(s:hlgroup_info, {'highlight': s:hlgroup_info}) - endif - endif - - augroup llama - autocmd! 
- autocmd InsertEnter * inoremap llama#fim_inline(v:false) - autocmd InsertLeavePre * call llama#fim_cancel() - - autocmd CursorMoved * call s:on_move() - autocmd CursorMovedI * call s:on_move() - autocmd CompleteChanged * call llama#fim_cancel() - - if g:llama_config.auto_fim - autocmd CursorMovedI * call llama#fim(v:true) - endif - - " gather chunks upon yanking - autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif - - " gather chunks upon entering/leaving a buffer - autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) - autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) - - " gather chunk upon saving the file - autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) - augroup END - - silent! call llama#fim_cancel() - - " init background update of the ring buffer - if g:llama_config.ring_n_chunks > 0 - call s:ring_update() - endif -endfunction - -" compute how similar two chunks of text are -" 0 - no similarity, 1 - high similarity -" TODO: figure out something better -function! s:chunk_sim(c0, c1) - let l:lines0 = len(a:c0) - let l:lines1 = len(a:c1) - - let l:common = 0 - - for l:line0 in a:c0 - for l:line1 in a:c1 - if l:line0 == l:line1 - let l:common += 1 - break - endif - endfor - endfor - - return 2.0 * l:common / (l:lines0 + l:lines1) -endfunction - -" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing -" -" no_mod - do not pick chunks from buffers with pending changes -" do_evict - evict chunks that are very similar to the new one -" -function! s:pick_chunk(text, no_mod, do_evict) - " do not pick chunks from buffers with pending changes or buffers that are not files - if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) - return - endif - - " if the extra context option is disabled - do nothing - if g:llama_config.ring_n_chunks <= 0 - return - endif - - " don't pick very small chunks - if len(a:text) < 3 - return - endif - - if len(a:text) + 1 < g:llama_config.ring_chunk_size - let l:chunk = a:text +" Echos the tokkenization of the provided string , or cursor to end of word +" Onus is placed on the user to include the preceding space +func llama#tokenizeWord(...) + if (a:0 > 0) + let l:input = a:1 else - let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2])) - let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)]) - - let l:chunk = a:text[l:l0:l:l1] + exe "normal \"*ye" + let l:input = @* endif - - let l:chunk_str = join(l:chunk, "\n") . 
"\n" - - " check if this chunk is already added - let l:exist = v:false - - for i in range(len(s:ring_chunks)) - if s:ring_chunks[i].data == l:chunk - let l:exist = v:true - break - endif - endfor - - for i in range(len(s:ring_queued)) - if s:ring_queued[i].data == l:chunk - let l:exist = v:true - break - endif - endfor - - if l:exist - return - endif - - " evict queued chunks that are very similar to the new one - for i in range(len(s:ring_queued) - 1, 0, -1) - if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9 - if a:do_evict - call remove(s:ring_queued, i) - let s:ring_n_evict += 1 - else - return - endif - endif - endfor - - " also from s:ring_chunks - for i in range(len(s:ring_chunks) - 1, 0, -1) - if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 - if a:do_evict - call remove(s:ring_chunks, i) - let s:ring_n_evict += 1 - else - return - endif - endif - endfor - - " TODO: become parameter ? - if len(s:ring_queued) == 16 - call remove(s:ring_queued, 0) - endif - - call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')}) - - "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) + let l:querydata = {"content": l:input} + let l:curlcommand = copy(s:curlcommand) + let l:curlcommand[2] = json_encode(l:querydata) + let l:curlcommand[8] = g:llama_api_url .. "/tokenize" + let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])}) endfunction -" picks a queued chunk, sends it for processing and adds it to s:ring_chunks -" called every g:llama_config.ring_update_ms -function! s:ring_update() - call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()}) - - " update only if in normal mode or if the cursor hasn't moved for a while - if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0 - return - endif - - if len(s:ring_queued) == 0 - return - endif - - " move the first queued chunk to the ring buffer - if len(s:ring_chunks) == g:llama_config.ring_n_chunks - call remove(s:ring_chunks, 0) - endif - - call add(s:ring_chunks, remove(s:ring_queued, 0)) - - "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) - - " send asynchronous job with the new extra context so that it is ready for the next FIM - let l:extra_context = [] - for l:chunk in s:ring_chunks - call add(l:extra_context, { - \ 'text': l:chunk.str, - \ 'time': l:chunk.time, - \ 'filename': l:chunk.filename - \ }) - endfor - - " no samplers needed here - let l:request = json_encode({ - \ 'input_prefix': "", - \ 'input_suffix': "", - \ 'input_extra': l:extra_context, - \ 'prompt': "", - \ 'n_predict': 1, - \ 'temperature': 0.0, - \ 'stream': v:false, - \ 'samplers': ["temperature"], - \ 'cache_prompt': v:true, - \ 't_max_prompt_ms': 1, - \ 't_max_predict_ms': 1 - \ }) - - let l:curl_command = [ - \ "curl", - \ "--silent", - \ "--no-buffer", - \ "--request", "POST", - \ "--url", g:llama_config.endpoint, - \ "--header", "Content-Type: application/json", - \ "--data", l:request - \ ] - - " no callbacks because we don't need to process the response - if s:ghost_text_nvim - call jobstart(l:curl_command, {}) - elseif s:ghost_text_vim - call job_start(l:curl_command, {}) - endif +func s:tokenizeWordCallback(plaintext, channel, msg) + echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens) endfunction -" necessary for 'inoremap ' -function! 
llama#fim_inline(is_auto) abort - call llama#fim(a:is_auto) - return '' + +" Echos the token count of the entire buffer (or provided string) +" Example usage :echo llama#tokenCount() +func llama#tokenCount(...) + if (a:0 > 0) + let l:buflines = a:1 + else + let l:buflines = getline(1,1000) + if l:buflines[0][0:1] == '!*' + let l:buflines = l:buflines[1:-1] + endif + let l:buflines = join(l:buflines, "\n") + endif + let l:querydata = {"content": l:buflines} + let l:curlcommand = copy(s:curlcommand) + let l:curlcommand[2] = json_encode(l:querydata) + let l:curlcommand[8] = g:llama_api_url .. "/tokenize" + let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"}) endfunction -" the main FIM call -" takes local context around the cursor and sends it together with the extra context to the server for completion -function! llama#fim(is_auto) abort - " we already have a suggestion for the current cursor position - if s:hint_shown && !a:is_auto - call llama#fim_cancel() - return - endif - - call llama#fim_cancel() - - " avoid sending repeated requests too fast - if reltimefloat(reltime(s:t_fim_start)) < 0.6 - if s:timer_fim != -1 - call timer_stop(s:timer_fim) - let s:timer_fim = -1 - endif - - let s:t_fim_start = reltime() - let s:timer_fim = timer_start(600, {-> llama#fim(v:true)}) - return - endif - - let s:t_fim_start = reltime() - - let s:content = [] - let s:can_accept = v:false - - let s:pos_x = col('.') - 1 - let s:pos_y = line('.') - let l:max_y = line('$') - - let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1) - let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix])) - - let s:line_cur = getline('.') - - let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x) - let s:line_cur_suffix = strpart(s:line_cur, s:pos_x) - - if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix - return - endif - - let l:prefix = "" - \ . join(l:lines_prefix, "\n") - \ . "\n" - - let l:prompt = "" - \ . s:line_cur_prefix - - let l:suffix = "" - \ . s:line_cur_suffix - \ . "\n" - \ . join(l:lines_suffix, "\n") - \ . 
"\n" - - " prepare the extra context data - let l:extra_context = [] - for l:chunk in s:ring_chunks - call add(l:extra_context, { - \ 'text': l:chunk.str, - \ 'time': l:chunk.time, - \ 'filename': l:chunk.filename - \ }) - endfor - - " the indentation of the current line - let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) - - let l:request = json_encode({ - \ 'input_prefix': l:prefix, - \ 'input_suffix': l:suffix, - \ 'input_extra': l:extra_context, - \ 'prompt': l:prompt, - \ 'n_predict': g:llama_config.n_predict, - \ 'n_indent': l:indent, - \ 'top_k': 40, - \ 'top_p': 0.99, - \ 'stream': v:false, - \ 'samplers': ["top_k", "top_p", "infill"], - \ 'cache_prompt': v:true, - \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, - \ 't_max_predict_ms': g:llama_config.t_max_predict_ms - \ }) - - let l:curl_command = [ - \ "curl", - \ "--silent", - \ "--no-buffer", - \ "--request", "POST", - \ "--url", g:llama_config.endpoint, - \ "--header", "Content-Type: application/json", - \ "--data", l:request - \ ] - - if s:current_job != v:null - if s:ghost_text_nvim - call jobstop(s:current_job) - elseif s:ghost_text_vim - call job_stop(s:current_job) - endif - endif - - " send the request asynchronously - if s:ghost_text_nvim - let s:current_job = jobstart(l:curl_command, { - \ 'on_stdout': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]), - \ 'on_exit': function('s:fim_on_exit'), - \ 'stdout_buffered': v:true - \ }) - elseif s:ghost_text_vim - let s:current_job = job_start(l:curl_command, { - \ 'out_cb': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]), - \ 'exit_cb': function('s:fim_on_exit') - \ }) - endif - - " TODO: per-file location - let l:delta_y = abs(s:pos_y - s:pos_y_pick) - - " gather some extra context nearby and process it in the background - " only gather chunks if the cursor has moved a lot - " TODO: something more clever? reranking? - if a:is_auto && l:delta_y > 32 - " expand the prefix even further - call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) - - " pick a suffix chunk - call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false) - - let s:pos_y_pick = s:pos_y - endif -endfunction - -" if first_line == v:true accept only the first line of the response -function! llama#fim_accept(first_line) - " insert the suggestion at the cursor location - if s:can_accept && len(s:content) > 0 - call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0]) - if len(s:content) > 1 - if !a:first_line - call append(s:pos_y, s:content[1:-1]) - endif - endif - - " move the cursor to the end of the accepted text - if !a:first_line && len(s:content) > 1 - call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1) - else - call cursor(s:pos_y, s:pos_x + len(s:content[0])) - endif - endif - - call llama#fim_cancel() -endfunction - -function! llama#fim_cancel() - let s:hint_shown = v:false - - " clear the virtual text - let l:bufnr = bufnr('%') - - if s:ghost_text_nvim - let l:id_vt_fim = nvim_create_namespace('vt_fim') - call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) - elseif s:ghost_text_vim - call prop_remove({'type': s:hlgroup_hint, 'all': v:true}) - call prop_remove({'type': s:hlgroup_info, 'all': v:true}) - endif - - " remove the mappings - silent! iunmap - silent! iunmap - silent! iunmap -endfunction - -function! 
s:on_move() - let s:t_last_move = reltime() - - call llama#fim_cancel() -endfunction - -" callback that processes the FIM result from the server and displays the suggestion -function! s:fim_on_stdout(pos_x, pos_y, is_auto, job_id, data, event = v:null) - if s:ghost_text_nvim - let l:raw = join(a:data, "\n") - elseif s:ghost_text_vim - let l:raw = a:data - endif - - if len(l:raw) == 0 - return - endif - - if a:pos_x != col('.') - 1 || a:pos_y != line('.') - return - endif - - " show the suggestion only in insert mode - if mode() !=# 'i' - return - endif - - let s:pos_x = a:pos_x - let s:pos_y = a:pos_y - - let s:can_accept = v:true - let l:has_info = v:false - - if s:can_accept && v:shell_error - if !a:is_auto - call add(s:content, "<| curl error: is the server on? |>") - endif - let s:can_accept = v:false - endif - - let l:n_prompt = 0 - let l:t_prompt_ms = 1.0 - let l:s_prompt = 0 - - let l:n_predict = 0 - let l:t_predict_ms = 1.0 - let l:s_predict = 0 - - " get the generated suggestion - if s:can_accept - let l:response = json_decode(l:raw) - - for l:part in split(get(l:response, 'content', ''), "\n", 1) - call add(s:content, l:part) - endfor - - " remove trailing new lines - while len(s:content) > 0 && s:content[-1] == "" - call remove(s:content, -1) - endwhile - - let l:generation_settings = get(l:response, 'generation_settings', {}) - let l:n_ctx = get(l:generation_settings, 'n_ctx', 0) - - let l:n_cached = get(l:response, 'tokens_cached', 0) - let l:truncated = get(l:response, 'truncated', v:false) - - " if response.timings is available - if len(get(l:response, 'timings', {})) > 0 - let l:has_info = v:true - let l:timings = get(l:response, 'timings', {}) - - let l:n_prompt = get(l:timings, 'prompt_n', 0) - let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) - let l:s_prompt = get(l:timings, 'prompt_per_second', 0) - - let l:n_predict = get(l:timings, 'predicted_n', 0) - let l:t_predict_ms = get(l:timings, 'predicted_ms', 1) - let l:s_predict = get(l:timings, 'predicted_per_second', 0) - endif - endif - - if len(s:content) == 0 - call add(s:content, "") - let s:can_accept = v:false - endif - - if len(s:content) == 0 - return - endif - - " NOTE: the following is logic for discarding predictions that repeat existing text - " the code is quite ugly and there is very likely a simpler and more canonical way to implement this - " - " still, I wonder if there is some better way that avoids having to do these special hacks? - " on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would - " start generating whatever we have given it via the extra context. but on the other hand, it's not very - " helpful to re-generate the same code that is already there - - " truncate the suggestion if the first line is empty - if len(s:content) == 1 && s:content[0] == "" - let s:content = [""] - endif - - " ... and the next lines are repeated - if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1) - let s:content = [""] - endif - - " truncate the suggestion if it repeats the suffix - if len(s:content) == 1 && s:content[0] == s:line_cur_suffix - let s:content = [""] - endif - - " find the first non-empty line (strip whitespace) - let l:cmp_y = s:pos_y + 1 - while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$' - let l:cmp_y += 1 - endwhile - - if (s:line_cur_prefix . 
s:content[0]) == getline(l:cmp_y) - " truncate the suggestion if it repeats the next line - if len(s:content) == 1 - let s:content = [""] - endif - - " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1 - if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1] - let s:content = [""] - endif - - " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1) - if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n") - let s:content = [""] - endif - endif - - " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix - "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) - "for i in range(1, len(s:content) - 1) - " if strlen(matchstr(s:content[i], '^\s*')) < l:indent - " let s:content = s:content[:i - 1] - " break - " endif - "endfor - - let s:pos_dx = len(s:content[-1]) - - let s:content[-1] .= s:line_cur_suffix - - call llama#fim_cancel() - - " display virtual text with the suggestion - let l:bufnr = bufnr('%') - - if s:ghost_text_nvim - let l:id_vt_fim = nvim_create_namespace('vt_fim') - endif - - " construct the info message - if g:llama_config.show_info > 0 && l:has_info - let l:prefix = ' ' - - if l:truncated - let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks", - \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', - \ l:n_cached, l:n_ctx - \ ) - else - let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", - \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', - \ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued), - \ l:n_prompt, l:t_prompt_ms, l:s_prompt, - \ l:n_predict, l:t_predict_ms, l:s_predict, - \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) - \ ) - endif - - if g:llama_config.show_info == 1 - " display the info in the statusline - let &statusline = l:info - let l:info = '' - endif - endif - - " display the suggestion and append the info to the end of the first line - if s:ghost_text_nvim - call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, { - \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']], - \ 'virt_text_win_col': virtcol('.') - 1 - \ }) - - call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, { - \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), - \ 'virt_text_win_col': virtcol('.') - \ }) - elseif s:ghost_text_vim - let l:new_suffix = s:content[0] - if !empty(l:new_suffix) - call prop_add(s:pos_y, s:pos_x + 1, { - \ 'type': s:hlgroup_hint, - \ 'text': l:new_suffix - \ }) - endif - for line in s:content[1:] - call prop_add(s:pos_y, 0, { - \ 'type': s:hlgroup_hint, - \ 'text': line, - \ 'text_padding_left': s:get_indent(line), - \ 'text_align': 'below' - \ }) - endfor - if !empty(l:info) - call prop_add(s:pos_y, 0, { - \ 'type': s:hlgroup_info, - \ 'text': l:info, - \ 'text_padding_left': col('$'), - \ 'text_wrap': 'truncate' - \ }) - endif - endif - - " setup accept shortcuts - inoremap :call llama#fim_accept(v:false) - inoremap :call llama#fim_accept(v:true) - - let s:hint_shown = v:true -endfunction - -function! s:fim_on_exit(job_id, exit_code, event = v:null) - if a:exit_code != 0 - echom "Job failed with exit code: " . 
a:exit_code - endif - - let s:current_job = v:null +func s:tokenCountCallback(channel, msg) + let resp = json_decode(a:msg) + echo len(resp.tokens) endfunction diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 319effd19..e9fa73acb 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .) target_include_directories(llava PUBLIC ../..) target_include_directories(llava PUBLIC ../../common) -target_compile_features(llava PRIVATE cxx_std_17) +target_compile_features(llava PRIVATE cxx_std_11) add_library(llava_static STATIC $) if (BUILD_SHARED_LIBS) @@ -35,25 +35,4 @@ add_executable(${TARGET} llava-cli.cpp) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) - -set(TARGET llama-minicpmv-cli) -add_executable(${TARGET} minicpmv-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) - -set(TARGET llama-qwen2vl-cli) -add_executable(${TARGET} qwen2vl-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) - -set(TARGET llama-llava-clip-quantize-cli) -add_executable(${TARGET} clip-quantize-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 4f783f3ce..06a65fba4 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: ```sh -python ./examples/llava/convert_image_encoder_to_gguf.py \ +python ./examples/llava/convert_image_encoder_to_gguf \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B/llava.projector \ --output-dir path/to/MobileVLM-1.7B \ @@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \ ``` ```sh -python ./examples/llava/convert_image_encoder_to_gguf.py \ +python ./examples/llava/convert_image_encoder_to_gguf \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \ --output-dir path/to/MobileVLM-1.7B_V2 \ @@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \ 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown +python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B ``` -5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k` +5. 
Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` ```sh -./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s +./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s ``` Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory. diff --git a/examples/llava/README-glmedge.md b/examples/llava/README-glmedge.md deleted file mode 100644 index 603d01474..000000000 --- a/examples/llava/README-glmedge.md +++ /dev/null @@ -1,43 +0,0 @@ -# GLMV-EDGE - -Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b). - -## Usage -Build with cmake or run `make llama-llava-cli` to build it. - -After building, run: `./llama-llava-cli` to see the usage. For example: - -```sh -./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <|user|>\n prompt <|assistant|>\n" -``` - -**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. -**note**: For GPU offloading ensure to use the `-ngl` flag just like usual - -## GGUF conversion - -1. Clone a GLMV-EDGE model ([2B](https://huggingface.co/THUDM/glm-edge-v-2b) or [5B](https://huggingface.co/THUDM/glm-edge-v-5b)). For example: - -```sh -git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/THUDM/glm-edge-v-2b -``` - -2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents: - -```sh -python ./examples/llava/glmedge-surgery.py -m ../model_path -``` - -4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF: - -```sh -python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path -``` - -5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF: - -```sh -python convert_hf_to_gguf.py ../model_path -``` - -Now both the LLM part and the image encoder are in the `model_path` directory. diff --git a/examples/llava/README-minicpmo2.6.md b/examples/llava/README-minicpmo2.6.md deleted file mode 100644 index 8713a43d6..000000000 --- a/examples/llava/README-minicpmo2.6.md +++ /dev/null @@ -1,46 +0,0 @@ -## MiniCPM-o 2.6 -Currently, this readme only supports minicpm-omni's image capabilities, and we will update the full-mode support as soon as possible. - -### Prepare models and code - -Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder. 
- -Clone llama.cpp: -```bash -git clone git@github.com:OpenBMB/llama.cpp.git -cd llama.cpp -git checkout minicpm-omni -``` - -### Usage of MiniCPM-o 2.6 - -Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us) - -```bash -python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6 -python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4 -python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model - -# quantize int4 version -./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M -``` - -Build llama.cpp using `CMake`: -https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md - -```bash -cmake -B build -cmake --build build --config Release -``` - -Inference on Linux or Mac -``` -# run f16 version -./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# run quantized int4 version -./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# or run in interactive mode -./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i -``` diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md deleted file mode 100644 index 1c8498ff9..000000000 --- a/examples/llava/README-minicpmv2.5.md +++ /dev/null @@ -1,99 +0,0 @@ -## MiniCPM-Llama3-V 2.5 - -### Prepare models and code - -Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder. - -Clone llama.cpp: -```bash -git clone https://github.com/ggerganov/llama.cpp -cd llama.cpp -``` - -### Usage - -Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us) - -```bash -python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5 -python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2 -python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model - -# quantize int4 version -./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M -``` - -Build for Linux or Mac - -```bash -make -make llama-minicpmv-cli -``` - -Inference on Linux or Mac -``` -# run f16 version -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" 
- -# run quantized int4 version -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# or run in interactive mode -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i -``` - -### Android - -#### Build on Android device using Termux -We found that build on Android device would bring better runtime performance, so we recommend to build on device. - -[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required). - -Install tools in Termux: -``` -apt update && apt upgrade -y -apt install git make cmake -``` - -It's recommended to move your model inside the `~/` directory for best performance: -``` -cd storage/downloads -mv model.gguf ~/ -``` - -#### Building the Project using Android NDK -Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. - -Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: - -```bash -mkdir build-android -cd build-android -export NDK=/your_ndk_path -cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. -make -``` - -Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). - -Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: - -(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) -``` -$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ -$cd /data/data/com.termux/files/home/bin -$chmod +x ./* -``` - -Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` - -``` -$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ -$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ -``` - -Now, you can start chatting: -``` -$cd /data/data/com.termux/files/home/bin -$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" -``` diff --git a/examples/llava/README-minicpmv2.6.md b/examples/llava/README-minicpmv2.6.md deleted file mode 100644 index c4be5e5dd..000000000 --- a/examples/llava/README-minicpmv2.6.md +++ /dev/null @@ -1,107 +0,0 @@ -## MiniCPM-V 2.6 - -### Prepare models and code - -Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder. 
- -Clone llama.cpp: -```bash -git clone git@github.com:OpenBMB/llama.cpp.git -cd llama.cpp -git checkout minicpmv-main -``` - -### Usage of MiniCPM-V 2.6 - -Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us) - -```bash -python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6 -python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3 -python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model - -# quantize int4 version -./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M -``` - -Build for Linux or Mac - -```bash -make -make llama-minicpmv-cli -``` - -Inference on Linux or Mac -``` -# run f16 version -./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# run quantized int4 version -./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# or run in interactive mode -./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i -``` - -### Video -Install FFmpeg -``` -brew install ffmpeg -brew install pkg-config -``` - -### Android - -#### Build on Android device using Termux -We found that build on Android device would bring better runtime performance, so we recommend to build on device. - -[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required). - -Install tools in Termux: -``` -apt update && apt upgrade -y -apt install git make cmake -``` - -It's recommended to move your model inside the `~/` directory for best performance: -``` -cd storage/downloads -mv model.gguf ~/ -``` - -#### Building the Project using Android NDK -Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. - -Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: - -```bash -mkdir build-android -cd build-android -export NDK=/your_ndk_path -cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. -make -``` - -Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). - -Finally, copy these built `llama` binaries and the model file to your device storage. 
Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: - -(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) -``` -$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ -$cd /data/data/com.termux/files/home/bin -$chmod +x ./* -``` - -Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` - -``` -$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ -$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ -``` - -Now, you can start chatting: -``` -$cd /data/data/com.termux/files/home/bin -$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" -``` diff --git a/examples/llava/README-quantize.md b/examples/llava/README-quantize.md deleted file mode 100644 index b931513ab..000000000 --- a/examples/llava/README-quantize.md +++ /dev/null @@ -1,44 +0,0 @@ -# Quantizing CLIP Visual Projector - -This is the tool for quantizing the CLIP visual projector model. Quantization reduces the precision of the model's weights, which can significantly decrease the model size and improve inference speed, often with minimal impact on performance. - -## Usage - -To quantize a CLIP visual projector model, use the following command: - -```sh -./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf -``` - -After the quantization, the visual projector can be used freely with the existing LLAVA cli (LLAVA, Qwen2VL, etc). - -### Arguments - -- `/path/to/ggml-model-f32.gguf`: The path to the input model file in FP32 or FP16 format. -- `/path/to/ggml-model-quantized.gguf`: The path where the quantized model will be saved. -- ``: The quantization type to apply. This should be an integer corresponding to one of the quantization types defined in the `enum ggml_type`. - -### Quantization Types - -The following quantization types are supported, based on the `enum ggml_type` definition: - -- `2` - `q4_0`: 4-bit quantization with a single scale value. -- `3` - `q4_1`: 4-bit quantization with a separate scale value for each block. -- `6` - `q5_0`: 5-bit quantization with a single scale value. -- `7` - `q5_1`: 5-bit quantization with a separate scale value for each block. -- `8` - `q8_0`: 8-bit quantization with a single scale value. - -### Example - -To quantize a model using the `q4_0` quantization type, you would run: - -```sh -./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf 2 -``` - -This command will generate a quantized model at `/path/to/ggml-model-quantized.gguf` using the `q4_0` quantization method. - -## Notes - -- Quantization can lead to a loss in model accuracy, depending on the chosen quantization type. It is recommended to evaluate the quantized model's performance on your specific task to ensure it meets your requirements. -- The quantized model will typically be smaller in size and faster to run, making it more suitable for deployment in resource-constrained environments. 
diff --git a/examples/llava/clip-quantize-cli.cpp b/examples/llava/clip-quantize-cli.cpp deleted file mode 100644 index 566506954..000000000 --- a/examples/llava/clip-quantize-cli.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include "arg.h" -#include "base64.hpp" -#include "log.h" -#include "common.h" -#include "sampling.h" -#include "clip.h" -#include "llava.h" -#include "llama.h" -#include "ggml.h" - -static void print_usage(int argc, char ** argv) { - (void) argc; - - fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]); - fprintf(stderr, " type = 2 - q4_0\n"); - fprintf(stderr, " type = 3 - q4_1\n"); - fprintf(stderr, " type = 6 - q5_0\n"); - fprintf(stderr, " type = 7 - q5_1\n"); - fprintf(stderr, " type = 8 - q8_0\n"); -} - -int main(int argc, char ** argv) { - if (argc != 4) { - print_usage(argc, argv); - return 1; - } - - const std::string fname_inp = argv[1]; - const std::string fname_out = argv[2]; - - const int itype = atoi(argv[3]); - - const int64_t t_main_start_us = ggml_time_us(); - - int64_t t_quantize_us = 0; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) { - fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); - return 1; - } - - t_quantize_us = ggml_time_us() - t_start_us; - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n"); - printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); - } - - return 0; -} diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 271cf2a2a..d23e282fb 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -3,31 +3,22 @@ // I'll gradually clean and extend it // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" +#include "log.h" #include "ggml.h" -#include "ggml-cpu.h" #include "ggml-alloc.h" #include "ggml-backend.h" -#include "gguf.h" -//#ifdef GGML_USE_CUDA -//#include "ggml-cuda.h" -//#endif -// -//#ifdef GGML_USE_SYCL -//#include "ggml-sycl.h" -//#endif -// -//#ifdef GGML_USE_METAL -//#include "ggml-metal.h" -//#endif -// -//#ifdef GGML_USE_CANN -//#include "ggml-cann.h" -//#endif -// -//#ifdef GGML_USE_VULKAN -//#include "ggml-vulkan.h" -//#endif +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -45,18 +36,6 @@ #include #include -#if defined(LLAVA_LOG_OFF) -# define LOG_INF(...) -# define LOG_WRN(...) -# define LOG_ERR(...) -# define LOG_DBG(...) -#else // defined(LLAVA_LOG_OFF) -# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -#endif // defined(LLAVA_LOG_OFF) - //#define CLIP_DEBUG_FUNCTIONS // RGB uint8 image @@ -95,31 +74,26 @@ static std::string format(const char * fmt, ...) 
{ // key constants // -#define KEY_FTYPE "general.file_type" -#define KEY_NAME "general.name" -#define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" -#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" -#define KEY_HAS_GLM_PROJ "clip.has_glm_projector" -#define KEY_MINICPMV_VERSION "clip.minicpmv_version" -#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger" -#define KEY_USE_GELU "clip.use_gelu" -#define KEY_USE_SILU "clip.use_silu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" -#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -133,8 +107,7 @@ static std::string format(const char * fmt, ...) { #define TN_TOKEN_EMBD "%s.token_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" -#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat -#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" +#define TN_PATCH_EMBD "v.patch_embd.weight" #define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" @@ -154,31 +127,12 @@ static std::string format(const char * fmt, ...) 
{ #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_IMAGE_NEWLINE "model.image_newline" -#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" -#define TN_MINICPMV_QUERY "resampler.query" -#define TN_MINICPMV_PROJ "resampler.proj.weight" -#define TN_MINICPMV_KV_PROJ "resampler.kv.weight" -#define TN_MINICPMV_ATTN "resampler.attn.%s.%s" -#define TN_MINICPMV_LN "resampler.ln_%s.%s" - -#define TN_GLM_ADAPER_CONV "adapter.conv.%s" -#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s" -#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s" -#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s" -#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" -#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" -#define TN_GLM_BOI_W "adapter.boi" -#define TN_GLM_EOI_W "adapter.eoi" - enum projector_type { PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP_NORM, PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDPV2, - PROJECTOR_TYPE_RESAMPLER, - PROJECTOR_TYPE_GLM_EDGE, - PROJECTOR_TYPE_MERGER, PROJECTOR_TYPE_UNKNOWN, }; @@ -186,9 +140,6 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MLP, "mlp" }, { PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_RESAMPLER, "resampler"}, - { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, - { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, }; @@ -199,7 +150,7 @@ static std::map PROJECTOR_TYPE_NAMES = { static int get_key_idx(const gguf_context * ctx, const char * key) { int i = gguf_find_key(ctx, key); if (i == -1) { - LOG_ERR("key %s not found in file\n", key); + LOG_TEE("key %s not found in file\n", key); throw std::runtime_error(format("Missing required key: %s", key)); } @@ -249,20 +200,17 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int } static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - if (search.empty()) { - return; + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; } - std::string builder; - builder.reserve(s.length()); - size_t pos = 0; - size_t last_pos = 0; - while ((pos = s.find(search, last_pos)) != std::string::npos) { - builder.append(s, last_pos, pos - last_pos); - builder.append(replace); - last_pos = pos + search.length(); - } - builder.append(s, last_pos, std::string::npos); - s = std::move(builder); + s = std::move(result); } static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { @@ -275,7 +223,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { { const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); int arr_n = gguf_get_arr_n(ctx_gguf, i); - const void * data = arr_type == GGUF_TYPE_STRING ? 
nullptr : gguf_get_arr_data(ctx_gguf, i); + const void * data = gguf_get_arr_data(ctx_gguf, i); std::stringstream ss; ss << "["; for (int j = 0; j < arr_n; j++) { @@ -304,7 +252,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { size_t tensor_size = ggml_nbytes(tensor); - LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", + LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", prefix, ggml_n_dims(tensor), tensor->name, tensor_size, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); } @@ -322,7 +270,7 @@ static projector_type clip_projector_type_from_string(const std::string & name) static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); + LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -341,7 +289,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); + LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -481,8 +429,7 @@ struct clip_vision_model { // embeddings struct ggml_tensor * class_embedding; - struct ggml_tensor * patch_embeddings_0; - struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + struct ggml_tensor * patch_embeddings; struct ggml_tensor * patch_bias; struct ggml_tensor * position_embeddings; @@ -512,12 +459,6 @@ struct clip_vision_model { struct ggml_tensor * mm_4_w = NULL; struct ggml_tensor * mm_4_b = NULL; - //GLMV-Edge projection - struct ggml_tensor * mm_model_adapter_conv_w; - struct ggml_tensor * mm_model_adapter_conv_b; - struct ggml_tensor * boi_w; - struct ggml_tensor * eoi_w; - // MobileVLM projection struct ggml_tensor * mm_model_mlp_1_w; struct ggml_tensor * mm_model_mlp_1_b; @@ -551,36 +492,12 @@ struct clip_vision_model { struct ggml_tensor * mm_model_mlp_2_b; struct ggml_tensor * mm_model_peg_0_w; struct ggml_tensor * mm_model_peg_0_b; - - // MINICPMV projection - struct ggml_tensor * mm_model_pos_embed_k; - struct ggml_tensor * mm_model_query; - struct ggml_tensor * mm_model_proj; - struct ggml_tensor * mm_model_kv_proj; - struct ggml_tensor * mm_model_attn_q_w; - struct ggml_tensor * mm_model_attn_q_b; - struct ggml_tensor * mm_model_attn_k_w; - struct ggml_tensor * mm_model_attn_k_b; - struct ggml_tensor * mm_model_attn_v_w; - struct ggml_tensor * mm_model_attn_v_b; - struct ggml_tensor * mm_model_attn_o_w; - struct ggml_tensor * mm_model_attn_o_b; - struct ggml_tensor * mm_model_ln_q_w; - struct ggml_tensor * mm_model_ln_q_b; - struct ggml_tensor * mm_model_ln_kv_w; - struct ggml_tensor * mm_model_ln_kv_b; - struct ggml_tensor * mm_model_ln_post_w; - struct ggml_tensor * mm_model_ln_post_b; }; struct clip_ctx { bool has_text_encoder = false; bool has_vision_encoder = false; bool has_llava_projector = false; - bool has_minicpmv_projector = false; - bool has_glm_projector = 
false; - bool has_qwen2vl_merger = false; - int minicpmv_version = 2; struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -588,7 +505,6 @@ struct clip_ctx { float image_mean[3]; float image_std[3]; bool use_gelu = false; - bool use_silu = false; int32_t ftype = 1; bool has_class_embedding = true; @@ -606,58 +522,31 @@ struct clip_ctx { ggml_backend_t backend = NULL; ggml_gallocr_t compute_alloc = NULL; - - struct clip_image_size * load_image_size; }; -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { if (!ctx->has_vision_encoder) { - LOG_ERR("This gguf file seems to have no vision encoder\n"); + LOG_TEE("This gguf file seems to have no vision encoder\n"); return nullptr; } const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - if (ctx->has_minicpmv_projector) { - if (load_image_size == nullptr) { - load_image_size = clip_image_size_init(); - } - LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); - image_size_width = load_image_size->width; - image_size_height = load_image_size->height; - if (is_inf) { - image_size_width = imgs->data->nx; - image_size_height = imgs->data->ny; - } - } - else if (ctx->has_qwen2vl_merger) { - // use the image's native resolution when image is avaible - if (is_inf) { - // if (imgs->data->nx && imgs->data->ny) { - image_size_width = imgs->data->nx; - image_size_height = imgs->data->ny; - } - } + const int image_size = hparams.image_size; const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int patches_w = image_size_width / patch_size; - const int patches_h = image_size_height / patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); - const int num_position_ids = ctx->has_qwen2vl_merger ? 
num_positions * 4 : num_positions; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; - int n_layer = hparams.n_layer; + const int n_layer = hparams.n_layer; const float eps = hparams.eps; - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; const int batch_size = imgs->size; - if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) { + if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); } @@ -670,79 +559,39 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph(ctx0); - struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); - struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - if (ctx->has_qwen2vl_merger) { - GGML_ASSERT(image_size_width % (patch_size * 2) == 0); - GGML_ASSERT(image_size_height % (patch_size * 2) == 0); - - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_reshape_4d( - ctx0, inp, - hidden_size * 2, patches_w / 2, patches_h, batch_size); - inp = ggml_reshape_4d( - ctx0, inp, - hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); - inp = ggml_reshape_3d( - ctx0, inp, - hidden_size, patches_w * patches_h, batch_size); - } - else { - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - } + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); if (ctx->has_patch_bias) { // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); inp = ggml_add(ctx0, inp, model.patch_bias); } - struct ggml_tensor * embeddings = inp; - struct ggml_tensor * pos_embed = nullptr; - if (ctx->has_llava_projector) { - // concat class_embeddings and patch_embeddings - if (ctx->has_class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } + // concat class_embeddings and patch_embeddings + struct ggml_tensor * embeddings = inp; + if (ctx->has_class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_set_name(embeddings, "embeddings"); + ggml_set_input(embeddings); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + 
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); } - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); ggml_set_name(positions, "positions"); ggml_set_input(positions); - if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding - embeddings = - ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); - } - - if (ctx->has_minicpmv_projector) { - int pos_w = image_size_width/patch_size; - int pos_h = image_size_height/patch_size; - if (ctx->minicpmv_version == 2) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 3) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 4) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } - ggml_set_name(pos_embed, "pos_embed"); - ggml_set_input(pos_embed); - } + embeddings = + ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); // pre-layernorm if (ctx->has_pre_norm) { @@ -753,9 +602,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // loop over layers - if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) { - n_layer += 1; - } for (int il = 0; il < n_layer - 1; il++) { struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states @@ -775,13 +621,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { - Q = ggml_rope_multi( - ctx0, Q, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - } Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); @@ -789,11 +630,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { - K = ggml_rope_multi( - ctx0, K, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - } K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); @@ -833,8 +669,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (ctx->use_gelu) { cur = ggml_gelu_inplace(ctx0, cur); - } else if (ctx->use_silu) { - cur = ggml_silu_inplace(ctx0, cur); } else { cur = ggml_gelu_quick_inplace(ctx0, cur); } @@ -846,7 +680,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 cur = ggml_add(ctx0, embeddings, cur); embeddings = cur; - } // post-layernorm @@ -858,7 +691,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // llava projector - if (ctx->has_llava_projector) { + { embeddings = ggml_reshape_2d(ctx0, embeddings, 
embeddings->ne[0], embeddings->ne[1]); struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); @@ -879,8 +712,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_gelu(ctx0, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - } - else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + + } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); @@ -918,7 +751,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); // layer norm // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] @@ -966,7 +799,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // block_2 { // stride = 2 - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // layer norm @@ -1027,7 +860,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // mlp_2 ne [24, 24, 2048, 1] mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); // weight ne = [3, 3, 2048, 1] - struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); @@ -1035,123 +868,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); embeddings = peg_0; } - else { - GGML_ABORT("fatal error"); - } - } - // minicpmv projector - else if (ctx->has_minicpmv_projector) - { - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - struct ggml_tensor * q = model.mm_model_query; - { // layernorm - q = ggml_norm(ctx0, q, eps); - q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - } - struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - { // layernorm - v = ggml_norm(ctx0, v, eps); - v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); - } - struct ggml_tensor * k; - { // position - // q = ggml_add(ctx0, q, model.mm_model_pos_embed); - k = ggml_add(ctx0, v, pos_embed); - } - - { // attention - int hidden_size = 4096; - const int d_head = 128; - int n_head = hidden_size/d_head; - int num_query = 96; - if (ctx->minicpmv_version == 2) { - hidden_size = 4096; - n_head = hidden_size/d_head; - num_query = 96; - } - else if (ctx->minicpmv_version == 3) { - hidden_size = 3584; - n_head = hidden_size/d_head; - 
num_query = 64; - } - else if (ctx->minicpmv_version == 4) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } - - struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); - Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); - struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); - struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); - // permute - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_inplace(ctx0, KQ); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); - - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); - } - { // layernorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); - } - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); - } else { GGML_ASSERT(false); } } - // glm projector - else if (ctx->has_glm_projector) { - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); - embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); - embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); - embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); - embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); - //GLU - { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - embeddings = ggml_gelu_inplace(ctx0, embeddings); - struct ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); - embeddings = ggml_silu_inplace(ctx0, embeddings); - embeddings = ggml_mul(ctx0, embeddings,x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); - } - } else { - GGML_ABORT("fatel error"); - } - } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); - - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - - // GELU 
activation - embeddings = ggml_gelu(ctx0, embeddings); - - // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); - } // build the graph ggml_build_forward_expand(gf, embeddings); @@ -1185,21 +905,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const int idx_name = gguf_find_key(ctx, KEY_NAME); if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug const std::string name = gguf_get_val_str(ctx, idx_name); - LOG_INF("%s: model name: %s\n", __func__, name.c_str()); + LOG_TEE("%s: model name: %s\n", __func__, name.c_str()); } - LOG_INF("%s: description: %s\n", __func__, description.c_str()); - LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); - LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); - LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors); - LOG_INF("%s: n_kv: %d\n", __func__, n_kv); - LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str()); - LOG_INF("\n"); + LOG_TEE("%s: description: %s\n", __func__, description.c_str()); + LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors); + LOG_TEE("%s: n_kv: %d\n", __func__, n_kv); + LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str()); + LOG_TEE("\n"); } const int n_tensors = gguf_get_n_tensors(ctx); // kv const int n_kv = gguf_get_n_kv(ctx); - LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", + LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", __func__, n_kv, n_tensors, fname); { std::map n_type; @@ -1210,7 +930,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { n_type[type]++; } - LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + LOG_TEE("%s: Dumping metadata keys/values. 
Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(ctx, i); const enum gguf_type type = gguf_get_kv_type(ctx, i); @@ -1226,7 +946,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } replace_all(value, "\n", "\\n"); - LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } // print type counts @@ -1235,7 +955,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { continue; } - LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } } @@ -1250,13 +970,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { size_t tensor_size = ggml_nbytes(cur); model_size += tensor_size; if (verbosity >= 3) { - LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); } } } - clip_ctx * new_clip = new clip_ctx{}; + clip_ctx * new_clip = new clip_ctx; // update projector type { @@ -1275,34 +995,25 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } } -//#ifdef GGML_USE_CUDA -// new_clip->backend = ggml_backend_cuda_init(0); -// LOG_INF("%s: CLIP using CUDA backend\n", __func__); -//#endif -// -//#ifdef GGML_USE_METAL -// new_clip->backend = ggml_backend_metal_init(); -// LOG_INF("%s: CLIP using Metal backend\n", __func__); -//#endif -// -//#ifdef GGML_USE_CANN -// new_clip->backend = ggml_backend_cann_init(0); -// LOG_INF("%s: CLIP using CANN backend\n", __func__); -//#endif -// -//#ifdef GGML_USE_VULKAN -// new_clip->backend = ggml_backend_vk_init(0); -// LOG_INF("%s: CLIP using Vulkan backend\n", __func__); -//#endif -// -//#ifdef GGML_USE_SYCL -// new_clip->backend = ggml_backend_sycl_init(0); -// LOG_INF("%s: CLIP using SYCL backend\n", __func__); -//#endif +#ifdef GGML_USE_CUDA + new_clip->backend = ggml_backend_cuda_init(0); + LOG_TEE("%s: CLIP using CUDA backend\n", __func__); +#endif + +#ifdef GGML_USE_METAL + new_clip->backend = ggml_backend_metal_init(); + LOG_TEE("%s: CLIP using Metal backend\n", __func__); +#endif + +#ifdef GGML_USE_CANN + new_clip->backend = ggml_backend_cann_init(0); + LOG_TEE("%s: CLIP using CANN backend\n", __func__); +#endif + if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); - LOG_INF("%s: CLIP using CPU backend\n", __func__); + LOG_TEE("%s: CLIP using CPU backend\n", __func__); } // model size and capabilities @@ -1318,52 +1029,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx); } - idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ); - if (idx != -1) { - new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx); - } - - idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION); - if (idx != -1) { - new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx); - } - - idx = gguf_find_key(ctx, KEY_HAS_GLM_PROJ); - if (idx != -1) { 
- new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx); - } - - idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER); - if (idx != -1) { - new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx); - } - // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search - + GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search GGML_ASSERT(new_clip->has_vision_encoder); GGML_ASSERT(!new_clip->has_text_encoder); idx = get_key_idx(ctx, KEY_USE_GELU); new_clip->use_gelu = gguf_get_val_bool(ctx, idx); - try { - idx = get_key_idx(ctx, KEY_USE_SILU); - new_clip->use_silu = gguf_get_val_bool(ctx, idx); - } catch (std::runtime_error & /*e*/) { - new_clip->use_silu = false; - } - if (verbosity >= 1) { - LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); - LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); - LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); - LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); - LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector); - LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); - LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } } - LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); + LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); // load tensors { @@ -1376,7 +1058,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->ctx_data = ggml_init(params); if (!new_clip->ctx_data) { - LOG_ERR("%s: ggml_init() failed\n", __func__); + LOG_TEE("%s: ggml_init() failed\n", __func__); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1384,7 +1066,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { - LOG_ERR("cannot open model file for loading tensors\n"); + LOG_TEE("cannot open model file for loading tensors\n"); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1406,7 +1088,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); fin.seekg(offset, std::ios::beg); if (!fin) { - LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name); + LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1477,23 +1159,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (verbosity >= 2) { - LOG_INF("\n%s: vision model hparams\n", __func__); - LOG_INF("image_size %d\n", hparams.image_size); - LOG_INF("patch_size %d\n", hparams.patch_size); - LOG_INF("v_hidden_size %d\n", hparams.hidden_size); - LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate); - 
LOG_INF("v_projection_dim %d\n", hparams.projection_dim); - LOG_INF("v_n_head %d\n", hparams.n_head); - LOG_INF("v_n_layer %d\n", hparams.n_layer); - LOG_INF("v_eps %f\n", hparams.eps); - LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); - LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); - LOG_INF("v_image_grid_pinpoints: "); + LOG_TEE("\n%s: vision model hparams\n", __func__); + LOG_TEE("image_size %d\n", hparams.image_size); + LOG_TEE("patch_size %d\n", hparams.patch_size); + LOG_TEE("v_hidden_size %d\n", hparams.hidden_size); + LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); + LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); + LOG_TEE("v_n_head %d\n", hparams.n_head); + LOG_TEE("v_n_layer %d\n", hparams.n_layer); + LOG_TEE("v_eps %f\n", hparams.eps); + LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + LOG_TEE("v_image_grid_pinpoints: "); for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { - LOG_INF("%d ", hparams.image_grid_pinpoints[i]); + LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); } - LOG_INF("\n"); - LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + LOG_TEE("\n"); + LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); } @@ -1528,15 +1210,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } try { - vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); } catch(const std::exception& /*e*/) { - LOG_ERR("%s: failed to load vision model tensors\n", __func__); - } - try { - vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1); - } catch(const std::exception& /*e*/) { - new_clip->has_qwen2vl_merger = false; + LOG_TEE("%s: failed to load vision model tensors\n", __func__); } // LLaVA projection @@ -1565,7 +1242,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch (std::runtime_error & /*e*/) { } try { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); - // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__); + // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); } catch (std::runtime_error & /*e*/) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection @@ -1604,45 +1281,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight")); vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias")); } - else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) { - // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); - vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K); - vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY); - vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ); - vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, 
TN_MINICPMV_KV_PROJ); - vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight")); - vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight")); - vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight")); - vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias")); - vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias")); - vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias")); - vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight")); - vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias")); - vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight")); - vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias")); - vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight")); - vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias")); - vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); - vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); - } - else if (new_clip->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight")); - vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias")); - vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight")); - vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight")); - vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias")); - vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight")); - vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight")); - vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); - vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W); - vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W); - } - else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) { - vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); - vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); - vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); - } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -1681,31 +1319,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); clip_image_f32_batch batch; batch.size = 1; - batch.data = nullptr; - ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); + 
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch); ggml_gallocr_reserve(new_clip->compute_alloc, gf); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); - LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); } return new_clip; } -void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) { - ctx_clip->load_image_size = load_image_size; -} - -struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) { - return ctx_clip->load_image_size; -} - -struct clip_image_size * clip_image_size_init() { - struct clip_image_size * load_image_size = new struct clip_image_size(); - load_image_size->width = 448; - load_image_size->height = 448; - return load_image_size; -} - struct clip_image_u8 * clip_image_u8_init() { return new clip_image_u8(); } @@ -1740,7 +1362,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { int nx, ny, nc; auto * data = stbi_load(fname, &nx, &ny, &nc, 3); if (!data) { - LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); + LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1752,7 +1374,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length int nx, ny, nc; auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); if (!data) { - LOG_ERR("%s: failed to decode image bytes\n", __func__); + LOG_TEE("%s: failed to decode image bytes\n", __func__); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1811,7 +1433,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* } } -inline int clip(int x, int lower, int upper) { +inline float clip(float x, float lower, float upper) { return std::max(lower, std::min(x, upper)); } @@ -1942,7 +1564,7 @@ static std::pair select_best_resolution(const std::pair & or int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -1976,224 +1598,12 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im return patches; } -static int ensure_divide(int length, int patch_size) { - return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); -} - -static std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width = original_size.first; - int height = original_size.second; - if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { - float r = 
static_cast(width) / height; - height = static_cast(scale_resolution / std::sqrt(r)); - width = static_cast(height * r); - } - int best_width = ensure_divide(width, patch_size); - int best_height = ensure_divide(height, patch_size); - return std::make_pair(best_width, best_height); -} - -static std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width, height; - std::tie(width, height) = original_size; - int grid_x, grid_y; - std::tie(grid_x, grid_y) = grid; - - int refine_width = ensure_divide(width, grid_x); - int refine_height = ensure_divide(height, grid_y); - - int grid_width = refine_width / grid_x; - int grid_height = refine_height / grid_y; - - // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) - auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair - int best_grid_width, best_grid_height; - std::tie(best_grid_width, best_grid_height) = best_grid_size; - - // std::pair refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) - std::pair refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line) - return refine_size; -} - -static std::pair uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { - std::vector candidate_split_grids_nums; - for (int i : {multiple - 1, multiple, multiple + 1}) { - if (i == 1 || i > max_slice_nums) { - continue; - } - candidate_split_grids_nums.push_back(i); - } - - std::vector> candidate_grids; - for (int split_grids_nums : candidate_split_grids_nums) { - int m = 1; - while (m <= split_grids_nums) { - if (split_grids_nums % m == 0) { - candidate_grids.emplace_back(m, split_grids_nums / m); - } - ++m; - } - } - - std::pair best_grid{1, 1}; - float min_error = std::numeric_limits::infinity(); - for (const auto& grid : candidate_grids) { - float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); - if (error < min_error) { - best_grid = grid; - min_error = error; - } - } - return best_grid; -} - -// inspired from LLaVA-UHD: -// -> https://arxiv.org/pdf/2403.11703 -// -> https://github.com/thunlp/LLaVA-UHD -// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 -static std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) { - const std::pair original_size={img->nx,img->ny}; - const int original_width = img->nx; - const int original_height = img->ny; - const float log_ratio = log(1.0*original_width/original_height); - const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); - const int multiple = fmin(ceil(ratio), max_slice_nums); - - std::vector> images; - LOG_INF("%s: multiple %d\n", __func__, multiple); - images.push_back(std::vector()); - - if (multiple <= 1) { - auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true); - clip_image_u8 * source_image = clip_image_u8_init(); - bicubic_resize(*img, *source_image, best_size.first, best_size.second); - // source_image = image.resize(best_size, Image.Resampling.BICUBIC) - images[images.size()-1].push_back(source_image); - } - else if 
(multiple > 1) { - auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); - clip_image_u8 * source_image = clip_image_u8_init(); - bicubic_resize(*img, *source_image, best_size.first, best_size.second); - // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) - LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); - images[images.size()-1].push_back(source_image); - - std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); - LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); - - auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); - clip_image_u8 * refine_image = clip_image_u8_init(); - bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); - - LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); - - // split_to_patches - int width = refine_image->nx; - int height = refine_image->ny; - int grid_x = int(width / best_grid.first); - int grid_y = int(height / best_grid.second); - for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ - images.push_back(std::vector()); - for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ - clip_image_u8 * patch = clip_image_u8_init(); - patch->nx = grid_x; - patch->ny = grid_y; - patch->buf.resize(3 * patch->nx * patch->ny); - for (int y = patches_i; y < patches_i + grid_y; ++y) { - for (int x = patches_j; x < patches_j + grid_x; ++x) { - const int i = 3 * (y * refine_image->nx + x); - const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j)); - patch->buf[j] = refine_image->buf[i]; - patch->buf[j+1] = refine_image->buf[i+1]; - patch->buf[j+2] = refine_image->buf[i+2]; - } - } - images[images.size()-1].push_back(patch); - } - } - clip_image_u8_free(refine_image); - } - return images; -} - -int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { - const int max_slice_nums=9; - const int scale_resolution=448; - const int original_width = ctx_clip->load_image_size->width; - const int original_height = ctx_clip->load_image_size->height; - const float log_ratio = log(1.0*original_width/original_height); - const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); - const int multiple = fmin(ceil(ratio), max_slice_nums); - std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); - return best_grid.first; -} - // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { - - if(clip_is_minicpmv(ctx)){ - int max_slice_nums = 9; - std::vector> imgs = uhd_slice_image(img, max_slice_nums); - res_imgs->size = 0; - for (size_t i = 0; i < imgs.size(); ++i){ - res_imgs->size += imgs[i].size(); - } - res_imgs->data = new clip_image_f32[res_imgs->size]; - int idx = 0; - for (size_t i = 0; i < imgs.size(); ++i) { - for (size_t j = 0; j < imgs[i].size(); ++j) { - LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); - clip_image_f32 * 
res = clip_image_f32_init(); - normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); - res_imgs->data[idx++] = *res; - clip_image_f32_free(res); - } - } - for (size_t i = 0; i < imgs.size(); ++i) { - for (size_t j = 0; j < imgs[i].size(); ++j) { - if (imgs[i][j] != nullptr) { - clip_image_u8_free(imgs[i][j]); - } - } - } - return true; - } - else if (ctx->has_qwen2vl_merger) { - clip_image_u8 * resized = clip_image_u8_init(); - auto patch_size = clip_patch_size(ctx) * 2; - int nx = ceil((float)img->nx / patch_size) * patch_size; - int ny = ceil((float)img->ny / patch_size) * patch_size; - bicubic_resize(*img, *resized, nx, ny); - - res_imgs->data = new clip_image_f32[1]; - // clip_image_f32 * res = clip_image_f32_init(); - normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std); - // res_imgs->data[0] = *res; - res_imgs->size = 1; - - // clip_image_f32_free(res); - clip_image_u8_free(resized); - return true; - } - - if (ctx->has_glm_projector) { - res_imgs->size = 1; - res_imgs->data = new clip_image_f32[res_imgs->size]; - clip_image_u8 resized_image; - int32_t sz=ctx->vision_model.hparams.image_size; - bicubic_resize(*img, resized_image,sz,sz); - clip_image_f32 * res = clip_image_f32_init(); - //clip_image_save_to_bmp(resized_image, "resized.bmp"); - normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std); - res_imgs->data[0] = *res; - clip_image_f32_free(res); - return true; - } - bool pad_to_square = true; if (!ctx->has_vision_encoder) { - LOG_ERR("This gguf file seems to have no vision encoder\n"); + LOG_TEE("This gguf file seems to have no vision encoder\n"); return false; } auto & params = ctx->vision_model.hparams; @@ -2270,7 +1680,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } for (size_t i = 0; i < patches.size(); i++) { - // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); + // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } @@ -2376,15 +1786,7 @@ void clip_free(clip_ctx * ctx) { } size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - int extra_tokens = ctx->has_glm_projector ? 
2 : 0; - return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float); -} - -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { - clip_image_f32 img; - img.nx = img_w; - img.ny = img_h; - return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); + return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); } int32_t clip_image_size(const struct clip_ctx * ctx) { @@ -2408,128 +1810,20 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) { } int clip_n_patches(const struct clip_ctx * ctx) { - clip_image_f32 img; - img.nx = ctx->vision_model.hparams.image_size; - img.ny = ctx->vision_model.hparams.image_size; - return clip_n_patches_by_img(ctx, &img); -} - -int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->vision_model.hparams; int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); - if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) { n_patches /= 4; - } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - if (ctx->minicpmv_version == 2) { - n_patches = 96; - } - else if (ctx->minicpmv_version == 3) { - n_patches = 64; - } - else if (ctx->minicpmv_version == 4) { - n_patches = 64; - } - } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - int patch_size = params.patch_size * 2; - int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); - int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); - n_patches = x_patch * y_patch; } return n_patches; } -static std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector> & pos) { - assert(embed_dim % 2 == 0); - int H = pos.size(); - int W = pos[0].size(); - - std::vector omega(embed_dim / 2); - for (int i = 0; i < embed_dim / 2; ++i) { - omega[i] = 1.0 / pow(10000.0, static_cast(i) / (embed_dim / 2)); - } - - std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - for (int d = 0; d < embed_dim / 2; ++d) { - float out_value = pos[h][w] * omega[d]; - emb[h][w][d] = sin(out_value); - emb[h][w][d + embed_dim / 2] = cos(out_value); - } - } - } - - return emb; -} - -static std::vector>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector>> & grid) { - assert(embed_dim % 2 == 0); - std::vector>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2) - std::vector>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2) - - int H = emb_h.size(); - int W = emb_h[0].size(); - std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); - - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - for (int d = 0; d < embed_dim / 2; ++d) { - emb[h][w][d] = emb_h[h][w][d]; - emb[h][w][d + embed_dim / 2] = emb_w[h][w][d]; - } - } - } - return emb; -} - -static std::vector> get_2d_sincos_pos_embed(int embed_dim, const std::pair image_size) { - int grid_h_size = image_size.first; - int grid_w_size = image_size.second; - - std::vector grid_h(grid_h_size); - std::vector grid_w(grid_w_size); - - for (int i = 0; i < grid_h_size; ++i) { - grid_h[i] = static_cast(i); - } - for (int i = 0; i < grid_w_size; ++i) { - grid_w[i] = static_cast(i); - } - - std::vector> grid(grid_h_size, 
std::vector(grid_w_size)); - for (int h = 0; h < grid_h_size; ++h) { - for (int w = 0; w < grid_w_size; ++w) { - grid[h][w] = grid_w[w]; - } - } - std::vector>> grid_2d = {grid, grid}; - for (int h = 0; h < grid_h_size; ++h) { - for (int w = 0; w < grid_w_size; ++w) { - grid_2d[0][h][w] = grid_h[h]; - grid_2d[1][h][w] = grid_w[w]; - } - } - - std::vector>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d); - - int H = image_size.first; - int W = image_size.second; - std::vector> pos_embed_2d(H * W, std::vector(embed_dim)); - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - pos_embed_2d[w * H + h] = pos_embed_3d[h][w]; - } - } - - return pos_embed_2d; -} - bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { - LOG_ERR("This gguf file seems to have no vision encoder\n"); + LOG_TEE("This gguf file seems to have no vision encoder\n"); return false; } @@ -2541,7 +1835,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { if (!ctx->has_vision_encoder) { - LOG_ERR("This gguf file seems to have no vision encoder\n"); + LOG_TEE("This gguf file seems to have no vision encoder\n"); return false; } @@ -2549,39 +1843,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); // TODO: support multiple images } - if (ctx->has_minicpmv_projector) { - GGML_ASSERT(batch_size == 1); - } - if (ctx->has_glm_projector) { - GGML_ASSERT(batch_size == 1); - ggml_tensor * boi = ctx->vision_model.boi_w; - ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi)); - vec = (float*)(vec+ggml_nelements(boi)); //offset for boi - } // build the inference graph - ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); // set inputs const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) { - image_size_width = imgs->data[0].nx; - image_size_height = imgs->data[0].ny; - } + const int image_size = hparams.image_size; const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_positions = num_patches + (ctx->has_class_embedding ? 
1 : 0); - if(ctx->load_image_size==nullptr){ - ctx->load_image_size= clip_image_size_init(); - } - const int pos_w = ctx->load_image_size->width/patch_size; - const int pos_h = ctx->load_image_size->height/patch_size; { struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); @@ -2590,9 +1864,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima for (size_t i = 0; i < imgs->size; i++) { const int nx = imgs->data[i].nx; const int ny = imgs->data[i].ny; - if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) { - GGML_ASSERT(nx == image_size && ny == image_size); - } + GGML_ASSERT(nx == image_size && ny == image_size); const int n = nx * ny; @@ -2609,144 +1881,65 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); free(data); } - if (ctx->has_minicpmv_projector) { - { - // inspired from siglip: - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - int bucket_coords_h[1024]; - int bucket_coords_w[1024]; - for (int i = 0; i < pos_h; i++){ - bucket_coords_h[i] = std::floor(70.0*i/pos_h); - } - for (int i = 0; i < pos_w; i++){ - bucket_coords_w[i] = std::floor(70.0*i/pos_w); - } - for (int i = 0, id = 0; i < pos_h; i++){ - for (int j = 0; j < pos_w; j++){ - positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; - } - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - { - // inspired from resampler of Qwen-VL: - // -> https://huggingface.co/Qwen/Qwen-VL/tree/main - // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 - struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); - int embed_dim = 4096; - if (ctx->minicpmv_version == 2) { - embed_dim = 4096; - } - else if (ctx->minicpmv_version == 3) { - embed_dim = 3584; - } - else if (ctx->minicpmv_version == 4) { - embed_dim = 3584; - } - auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + { + if (ctx->has_class_embedding) { + struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); - for(int i=0;i < pos_w * pos_h; ++i){ - for(int j=0; j < embed_dim; ++j){ - pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j]; - } - } - - ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed)); - free(pos_embed_data); + void* zero_mem = malloc(ggml_nbytes(embeddings)); + memset(zero_mem, 0, ggml_nbytes(embeddings)); + ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); + free(zero_mem); } } - else{ - { - if (ctx->has_class_embedding) { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); - } + { + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int i = 0; i < num_positions; i++) { + 
positions_data[i] = i; } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } - if (ctx->has_qwen2vl_merger) { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - - const int pw = image_size_width / patch_size; - const int ph = image_size_height / patch_size; - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - - int ptr = 0; - for (int y = 0; y < ph; y+=2) - { - for (int x = 0; x < pw; x+=2) - { - for (int dy = 0; dy < 2; dy++) { - for (int dx = 0; dx < 2; dx++) { - positions_data[ptr] = y + dy; - positions_data[num_patches + ptr] = x + dx; - positions_data[num_patches * 2 + ptr] = y + dy; - positions_data[num_patches * 3 + ptr] = x + dx; - ptr++; - } - } - } - } - - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - else { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - - if (!ctx->has_glm_projector) { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); - int* patches_data = (int*)malloc(ggml_nbytes(patches)); - for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + 1; - } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); - } + { + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { + patches_data[i] = i + 1; } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); } if (ggml_backend_is_cpu(ctx->backend)) { ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); } +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(ctx->backend)) { + ggml_backend_metal_set_n_cb(ctx->backend, n_threads); + } +#endif + ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor - struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - if (ctx->has_glm_projector) { - //eoi - ggml_tensor * eoi = ctx->vision_model.eoi_w; - int offset = ggml_nelements(embeddings); - ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi)); - } - return true; } bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { + ggml_type type = GGML_TYPE_Q4_1; + assert(itype < GGML_TYPE_COUNT); - ggml_type type = static_cast(itype); + type = static_cast(itype); auto * ctx_clip = clip_model_load(fname_inp, 2); @@ -2799,14 +1992,14 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } } - // quantize only 2D tensors and bigger than block size - quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type); + // quantize only 2D tensors + quantize &= (ggml_n_dims(cur) == 2); if (quantize) { new_type = type; if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type - // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); + // LOG_TEE("%s: quantizing %s to %s\n", __func__, 
name.c_str(), ggml_type_name(new_type)); } const size_t n_elms = ggml_nelements(cur); float * f32_data; @@ -2825,7 +2018,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i f32_data = (float *)conv_buf.data(); break; default: - LOG_ERR("Please use an input file in f32 or f16\n"); + LOG_TEE("Please use an input file in f32 or f16\n"); gguf_free(ctx_out); return false; } @@ -2845,15 +2038,14 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i total_size_org += orig_size; total_size_new += new_size; gguf_set_tensor_type(ctx_out, name.c_str(), new_type); - GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size); - gguf_set_tensor_data(ctx_out, name.c_str(), new_data); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); fout.write((const char *)new_data, new_size); size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; for (size_t j = 0; j < pad; ++j) { fout.put(0); } - LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, + LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); } @@ -2869,8 +2061,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i gguf_free(ctx_out); { - LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); - LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); } return true; @@ -2889,52 +2081,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { return ctx->vision_model.mm_3_b->ne[0]; } - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - if (ctx->minicpmv_version == 2) { - return 4096; - } - else if (ctx->minicpmv_version == 3) { - return 3584; - } - else if (ctx->minicpmv_version == 4) { - return 3584; - } - } - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){ - return ctx->vision_model.mm_model_mlp_3_w->ne[1]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - return ctx->vision_model.mm_1_b->ne[0]; - } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } - -int clip_is_minicpmv(const struct clip_ctx * ctx) { - if (ctx->has_minicpmv_projector) { - return ctx->minicpmv_version; - } - return 0; -} - -bool clip_is_glm(const struct clip_ctx * ctx) { - return ctx->has_glm_projector; -} -bool clip_is_qwen2vl(const struct clip_ctx * ctx) { - return ctx->has_qwen2vl_merger; -} - - -bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { - clip_image_f32 clip_img; - clip_img.buf.resize(h * w * 3); - for (int i = 0; i < h*w*3; i++) - { - clip_img.buf[i] = img[i]; - } - clip_img.nx = w; - clip_img.ny = h; - clip_image_encode(ctx, n_threads, &clip_img, vec); - return true; -} diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 841b4f6f9..ca3631384 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -18,17 +18,14 @@ # define CLIP_API #endif +struct clip_ctx; + #ifdef __cplusplus extern "C" { #endif struct clip_ctx; -struct clip_image_size { - int 
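The quantization loop above pads each written tensor to the GGUF alignment (GGML_PAD rounds a size up to the next multiple of the alignment). A small self-contained sketch of the padding arithmetic, with an assumed example alignment of 32 bytes; the real code queries gguf_get_alignment(ctx_out):

#include <cstddef>
#include <cstdio>

// number of zero bytes to append so that size becomes a multiple of align
static size_t pad_to_alignment(size_t size, size_t align) {
    return (size + align - 1) / align * align - size;
}

int main() {
    std::printf("pad for 1000 bytes at 32-byte alignment: %zu\n", pad_to_alignment(1000, 32));
    return 0;
}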
width; - int height; -}; - struct clip_image_u8_batch { struct clip_image_u8 * data; size_t size; @@ -45,7 +42,6 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); -CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); @@ -56,15 +52,9 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); -CLIP_API int clip_n_patches (const struct clip_ctx * ctx); -CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img); -CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx); +CLIP_API int clip_n_patches (const struct clip_ctx * ctx); +CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); -CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); -CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); -CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip); - -CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); @@ -88,13 +78,6 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); -CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); -CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); - -CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); - -CLIP_API bool clip_is_glm(const struct clip_ctx * ctx); - #ifdef __cplusplus } #endif diff --git a/examples/llava/convert_image_encoder_to_gguf.py b/examples/llava/convert_image_encoder_to_gguf.py index 4fa1d6cea..36f6b92fb 100644 --- a/examples/llava/convert_image_encoder_to_gguf.py +++ b/examples/llava/convert_image_encoder_to_gguf.py @@ -274,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu) if has_llava_projector: - model.vision_model.encoder.layers.pop(-1) + model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue] projector = torch.load(args.llava_projector) for name, data in projector.items(): name = get_tensor_name(name) @@ -288,7 +288,7 @@ if has_llava_projector: print("Projector tensors added\n") -state_dict = model.state_dict() +state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue] for name, data in state_dict.items(): if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): # we don't need this diff --git a/examples/llava/glmedge-convert-image-encoder-to-gguf.py b/examples/llava/glmedge-convert-image-encoder-to-gguf.py deleted file mode 100644 index 848ef1cf3..000000000 --- a/examples/llava/glmedge-convert-image-encoder-to-gguf.py +++ /dev/null @@ -1,280 +0,0 @@ -import argparse -import os -import json -import re - -import torch -import numpy as np -from gguf import * - -TEXT = "clip.text" -VISION = "clip.vision" -from transformers import SiglipVisionModel, SiglipVisionConfig - -def k(raw_key: str, arch: str) -> str: - return raw_key.format(arch=arch) - - -def should_skip_tensor(name: str, has_text: bool, 
has_vision: bool, has_llava: bool) -> bool: - if name in ( - "logit_scale", - "text_model.embeddings.position_ids", - "vision_model.embeddings.position_ids", - ): - return True - - if name in ( - "vision_model.head.probe", - "vision_model.head.attention.in_proj_weight", - "vision_model.head.attention.in_proj_bias", - "vision_model.head.attention.out_proj.weight", - "vision_model.head.attention.out_proj.bias", - "vision_model.head.layernorm.weight", - "vision_model.head.layernorm.bias", - "vision_model.head.mlp.fc1.weight", - "vision_model.head.mlp.fc1.bias", - "vision_model.head.mlp.fc2.weight", - "vision_model.head.mlp.fc2.bias" - ): - return True - - if name.startswith("v") and not has_vision: - return True - - if name.startswith("t") and not has_text: - return True - - return False - - -def get_tensor_name(name: str) -> str: - if "projection" in name: - return name - if "mm_projector" in name: - name = name.replace("model.mm_projector", "mm") - name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) - name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) - return name - - return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") - - -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) - + list(range(ord("¡"), ord("¬") + 1)) - + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) -ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") -ap.add_argument("--text-only", action="store_true", required=False, - help="Save a text-only model. It can't be used to encode images") -ap.add_argument("--vision-only", action="store_true", required=False, - help="Save a vision-only model. It can't be used to encode texts") -ap.add_argument("--clip-model-is-vision", action="store_true", required=False, - help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") -ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, - help="The clip model is from openclip (for ViT-SO400M type))") -ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") -ap.add_argument("--projector-type", help="Type of projector. 
Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2","adapter"], default="adapter") -ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) -# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 -# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 -default_image_mean = [0.5, 0.5, 0.5] -default_image_std = [0.5, 0.5, 0.5] -ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) -ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) - -# with proper -args = ap.parse_args() - - -if args.text_only and args.vision_only: - print("--text-only and --image-only arguments cannot be specified at the same time.") - exit(1) - -if args.use_f32: - print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") - -# output in the same directory as the model if output_dir is None -dir_model = args.model_dir - -if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: - vocab = None - tokens = None -else: - with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - vocab = json.load(f) - tokens = [key for key in vocab] - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - config = json.load(f) - if args.clip_model_is_vision: - v_hparams = config - t_hparams = None - else: - v_hparams = config["vision_config"] - t_hparams = None - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if args.use_f32: - ftype = 0 - -vision_config = SiglipVisionConfig(**v_hparams) -model = SiglipVisionModel(vision_config) -model.load_state_dict(torch.load(os.path.join(dir_model, "glm.clip"))) - -fname_middle = None -has_text_encoder = False -has_vision_encoder = True -has_glm_projector = True -if args.text_only: - fname_middle = "text-" - has_vision_encoder = False -elif args.llava_projector is not None: - fname_middle = "mmproj-" - has_text_encoder = False - has_glm_projector = True -elif args.vision_only: - fname_middle = "vision-" - has_text_encoder = False -else: - fname_middle = "" - -output_dir = args.output_dir if args.output_dir is not None else dir_model -os.makedirs(output_dir, exist_ok=True) -output_prefix = os.path.basename(output_dir).replace("ggml_", "") -fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") -fout = GGUFWriter(path=fname_out, arch="clip") - -fout.add_bool("clip.has_text_encoder", has_text_encoder) -fout.add_bool("clip.has_vision_encoder", has_vision_encoder) -fout.add_bool("clip.has_glm_projector", has_glm_projector) -fout.add_file_type(ftype) -model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) -fout.add_name(model_name) -if has_glm_projector: - fout.add_description("image encoder for glm4v") - fout.add_string("clip.projector_type", "adapter") -else: - fout.add_description("two-tower CLIP model") - -if has_text_encoder: - assert t_hparams is not None - assert tokens is not None - # text_model hparams - fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) - fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) - 
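The converter stores clip.vision.image_mean and clip.vision.image_std (defaulting to 0.5 each here) so the runtime can standardize pixels during preprocessing. A rough sketch of how such values are typically applied per channel; this is illustrative only, not the clip.cpp implementation:

#include <cstdint>
#include <cstdio>

// scale an 8-bit channel value to [0, 1], then standardize with the stored mean and std
static float normalize_channel(uint8_t v, float mean, float std_dev) {
    return (v / 255.0f - mean) / std_dev;
}

int main() {
    std::printf("normalized: %f\n", normalize_channel(128, 0.5f, 0.5f));
    return 0;
}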
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) - fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) - fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) - fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) - fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) - fout.add_token_list(tokens) - -if has_vision_encoder: - # vision_model hparams - fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) - fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) - fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) - fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) - fout.add_uint32("clip.vision.projection_dim", 0) - fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) - fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) - fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), v_hparams["num_hidden_layers"]) - - image_mean = args.image_mean if args.image_mean is not None else default_image_mean - image_std = args.image_std if args.image_std is not None else default_image_std - fout.add_array("clip.vision.image_mean", image_mean) - fout.add_array("clip.vision.image_std", image_std) - -fout.add_bool("clip.use_gelu", True) - - -if has_glm_projector: - # model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue] - projector = torch.load(args.llava_projector) - for name, data in projector.items(): - name = get_tensor_name(name) - # pw and dw conv ndim==4 - if data.ndim == 2 or data.ndim == 4: - data = data.squeeze().numpy().astype(np.float16) - else: - data = data.squeeze().numpy().astype(np.float32) - if name.startswith("vision."): - name=name.replace("vision.","") - fout.add_tensor(name, data) - print(f"Projector {name} - {data.dtype} - shape = {data.shape}") - # print(f"Projector {name} tensors added\n") - -state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue] -for name, data in state_dict.items(): - if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_glm_projector): - # we don't need this - print(f"skipping parameter: {name}") - continue - - name = get_tensor_name(name) - data = data.squeeze().numpy() - - n_dims = len(data.shape) - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0 - if n_dims == 4: - print(f"tensor {name} is always saved in f16") - data = data.astype(np.float16) - ftype_cur = 1 - elif ftype == 1: - if name[-7:] == ".weight" and n_dims == 2: - # print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - # print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - # print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - print(f"siglip {name} - {data.dtype} - shape = {data.shape}") - # print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") - fout.add_tensor(name, data) - - -fout.write_header_to_file() -fout.write_kv_data_to_file() -fout.write_tensors_to_file() -fout.close() - -print("Done. 
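The tensor-export loop above follows a simple dtype policy: when f16 output is requested, 4-D (conv) tensors and 2-D ".weight" tensors are stored as f16 and everything else stays f32. A hedged C++ rendering of that rule, for reference only:

#include <string>

enum class out_dtype { F32, F16 };

static out_dtype choose_dtype(const std::string & name, int n_dims, int ftype /* 0 = f32, 1 = f16 */) {
    if (n_dims == 4) {
        return out_dtype::F16;  // conv kernels are always stored in f16 by the converter
    }
    if (ftype == 1 && n_dims == 2 && name.size() >= 7 &&
        name.compare(name.size() - 7, 7, ".weight") == 0) {
        return out_dtype::F16;
    }
    return out_dtype::F32;
}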
Output file: " + fname_out) diff --git a/examples/llava/glmedge-surgery.py b/examples/llava/glmedge-surgery.py deleted file mode 100644 index 16bb915d0..000000000 --- a/examples/llava/glmedge-surgery.py +++ /dev/null @@ -1,33 +0,0 @@ -import argparse -import os -import torch -from transformers import AutoModel - -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model", help="Path to GLM model") -args = ap.parse_args() - -# find the model part that includes the the multimodal projector weights -model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True) -checkpoint = model.state_dict() - -# get a list of mm tensor names -mm_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.adapter.")] - -# store these tensors in a new dictionary and torch.save them -projector = {name: checkpoint[name].float() for name in mm_tensors} -torch.save(projector, f"{args.model}/glm.projector") - -clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.vit.model.vision_model.")] -if len(clip_tensors) > 0: - clip = {name.replace("vision.vit.model.", ""): checkpoint[name].float() for name in clip_tensors} - torch.save(clip, f"{args.model}/glm.clip") - - # added tokens should be removed to be able to convert Mistral models - if os.path.exists(f"{args.model}/added_tokens.json"): - with open(f"{args.model}/added_tokens.json", "w") as f: - f.write("{}\n") - -print("Done!") -print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") -print(f"Also, use {args.model}glm.projector to prepare a glm-encoder.gguf file.") diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 40aa0876f..8c7dd2ae3 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -1,16 +1,14 @@ -#include "arg.h" -#include "base64.hpp" +#include "ggml.h" #include "log.h" #include "common.h" -#include "sampling.h" #include "clip.h" #include "llava.h" #include "llama.h" -#include "ggml.h" + +#include "base64.hpp" #include #include -#include #include static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { @@ -20,8 +18,8 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector n_batch) { n_eval = n_batch; } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) { - LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + LOG_TEE("%s : failed to eval. 
token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); return false; } *n_past += n_eval; @@ -37,25 +35,21 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ std::string str2 = str; - std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); + std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); eval_tokens(ctx_llama, embd_inp, n_batch, n_past); return true; } -static const char * sample(struct common_sampler * smpl, +static const char * sample(struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_llama, int * n_past) { - const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); - common_sampler_accept(smpl, id, true); - - const llama_model * model = llama_get_model(ctx_llama); - const llama_vocab * vocab = llama_model_get_vocab(model); - + const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); + llama_sampling_accept(ctx_sampling, ctx_llama, id, true); static std::string ret; - if (llama_vocab_is_eog(vocab, id)) { + if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { ret = "
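eval_tokens() above feeds the token list to llama_decode in chunks of at most n_batch and advances n_past by the number of tokens evaluated. A generic sketch of that chunking pattern, with the decode call abstracted into a callback so the snippet stays self-contained:

#include <cstdio>
#include <vector>

template <typename DecodeFn>
static bool eval_in_batches(const std::vector<int> & tokens, int n_batch, int * n_past, DecodeFn decode) {
    const int N = (int) tokens.size();
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = N - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        // decode is assumed to return true on success (llama_decode itself returns 0 on success)
        if (!decode(&tokens[i], n_eval, *n_past)) {
            std::fprintf(stderr, "failed to eval token %d/%d\n", i, N);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}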
"; } else { - ret = common_token_to_piece(ctx_llama, id); + ret = llama_token_to_piece(ctx_llama, id); } eval_id(ctx_llama, id, n_past); return ret.c_str(); @@ -80,7 +74,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip size_t img_base64_str_start, img_base64_str_end; find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { - LOG_ERR("%s: invalid base64 image tag. must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); + LOG_TEE("%s: invalid base64 image tag. must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); return NULL; } @@ -94,7 +88,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); if (!embed) { - LOG_ERR("%s: could not load image from base64 string.\n", __func__); + LOG_TEE("%s: could not load image from base64 string.\n", __func__); return NULL; } @@ -118,29 +112,31 @@ struct llava_context { struct llama_model * model = NULL; }; -static void print_usage(int, char ** argv) { - LOG("\n example usage:\n"); - LOG("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\n example usage:\n"); + LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); } -static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) { +static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { // load and preprocess the image llava_image_embed * embed = NULL; auto prompt = params->prompt; if (prompt_contains_image(prompt)) { if (!params->image.empty()) { - LOG_INF("using base64 encoded image instead of command line image path\n"); + LOG_TEE("using base64 encoded image instead of command line image path\n"); } - embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); + embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt); if (!embed) { - LOG_ERR("%s: can't load image from prompt\n", __func__); + LOG_TEE("%s: can't load image from prompt\n", __func__); return NULL; } params->prompt = remove_image_from_prompt(prompt); } else { - embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str()); + embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str()); if (!embed) { fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); return NULL; @@ -150,7 +146,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, common_p return embed; } -static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) { +static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) { int 
n_past = 0; const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; @@ -161,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ // new templating mode: Provide the full prompt including system message and use as a placeholder for the image system_prompt = prompt.substr(0, image_pos); user_prompt = prompt.substr(image_pos + std::string("").length()); - LOG_INF("system_prompt: %s\n", system_prompt.c_str()); + LOG_TEE("system_prompt: %s\n", system_prompt.c_str()); if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } - LOG_INF("user_prompt: %s\n", user_prompt.c_str()); + LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } else { @@ -180,9 +176,9 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; user_prompt = prompt + "\nASSISTANT:"; if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } @@ -193,21 +189,21 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ // generate the response - LOG("\n"); + LOG_TEE("\n"); - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); - if (!smpl) { - LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); + if (!ctx_sampling) { + fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); exit(1); } std::string response = ""; for (int i = 0; i < max_tgt_len; i++) { - const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); + const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - LOG("%s", tmp); + printf("%s", tmp); if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 @@ -215,25 +211,25 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ fflush(stdout); } - 
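process_prompt() above splits the prompt around the image placeholder when one is present, otherwise it falls back to the default llava-1.5 system prompt. A minimal sketch of that split, assuming the placeholder is the literal "<image>" tag:

#include <string>
#include <utility>

static std::pair<std::string, std::string> split_prompt_at_image(const std::string & prompt,
                                                                 const std::string & tag = "<image>") {
    const size_t pos = prompt.find(tag);
    if (pos == std::string::npos) {
        // no placeholder: use a default system prompt and treat the whole input as the user prompt
        return { "A chat between a curious human and an artificial intelligence assistant. "
                 "The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:",
                 prompt + "\nASSISTANT:" };
    }
    return { prompt.substr(0, pos), prompt.substr(pos + tag.size()) };
}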
common_sampler_free(smpl); - LOG("\n"); + llama_sampling_free(ctx_sampling); + printf("\n"); } -static struct llama_model * llava_init(common_params * params) { +static struct llama_model * llava_init(gpt_params * params) { llama_backend_init(); llama_numa_init(params->numa); - llama_model_params model_params = common_model_params_to_llama(*params); + llama_model_params model_params = llama_model_params_from_gpt_params(*params); - llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { - LOG_ERR("%s: unable to load model\n" , __func__); + LOG_TEE("%s: error: unable to load model\n" , __func__); return NULL; } return model; } -static struct llava_context * llava_init_context(common_params * params, llama_model * model) { +static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { const char * clip_path = params->mmproj.c_str(); auto prompt = params->prompt; @@ -243,17 +239,18 @@ static struct llava_context * llava_init_context(common_params * params, llama_m auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - llama_context_params ctx_params = common_context_params_to_llama(*params); + + llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings - llama_context * ctx_llama = llama_init_from_model(model, ctx_params); + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); if (ctx_llama == NULL) { - LOG_ERR("%s: failed to create the llama_context\n" , __func__); + LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); return NULL; } - auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); ctx_llava->ctx_llama = ctx_llama; ctx_llava->ctx_clip = ctx_clip; @@ -268,65 +265,76 @@ static void llava_free(struct llava_context * ctx_llava) { } llama_free(ctx_llava->ctx_llama); - llama_model_free(ctx_llava->model); + llama_free_model(ctx_llava->model); llama_backend_free(); } +static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + LOG_TEE("%s", text); +} + int main(int argc, char ** argv) { ggml_time_init(); - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } - common_init(); +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("llava", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); + llama_log_set(llama_log_callback_logTee, nullptr); +#endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - print_usage(argc, argv); + print_usage(argc, argv, {}); return 1; } - - auto * model = llava_init(¶ms); + auto model = llava_init(¶ms); if (model == NULL) { fprintf(stderr, "%s: error: failed to init llava model\n", __func__); return 1; } if (prompt_contains_image(params.prompt)) { - auto * ctx_llava = llava_init_context(¶ms, model); + auto ctx_llava = llava_init_context(¶ms, model); - auto * image_embed = load_image(ctx_llava, ¶ms, ""); + auto image_embed = load_image(ctx_llava, ¶ms, ""); // process the prompt 
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_context_print(ctx_llava->ctx_llama); + llama_print_timings(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); } else { for (auto & image : params.image) { - auto * ctx_llava = llava_init_context(¶ms, model); + auto ctx_llava = llava_init_context(¶ms, model); - auto * image_embed = load_image(ctx_llava, ¶ms, image); + auto image_embed = load_image(ctx_llava, ¶ms, image); if (!image_embed) { - LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); + std::cerr << "error: failed to load image " << image << ". Terminating\n\n"; return 1; } // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_context_print(ctx_llava->ctx_llama); + llama_print_timings(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); } } - llama_model_free(model); + llama_free_model(model); return 0; } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 300714045..63878d176 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -1,27 +1,13 @@ #include "clip.h" -#include "llava.h" - +#include "common.h" #include "llama.h" +#include "llava.h" +#include "base64.hpp" -#include -#include #include #include -#include -#include #include - -#if defined(LLAVA_LOG_OFF) -# define LOG_INF(...) -# define LOG_WRN(...) -# define LOG_ERR(...) -# define LOG_DBG(...) -#else // defined(LLAVA_LOG_OFF) -# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -# define LOG_DBG(...) 
do { fprintf(stdout, __VA_ARGS__); } while (0) -#endif // defined(LLAVA_LOG_OFF) +#include // RGB uint8 image struct clip_image_u8 { @@ -68,7 +54,7 @@ static std::pair select_best_resolution(const std::pair& ori int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -198,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); ggml_build_forward_expand(gf, flatten); ggml_graph_compute_with_ctx(model.ctx, gf, 1); - struct ggml_tensor* result = ggml_graph_node(gf, -1); + struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): @@ -216,33 +202,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector return true; } -static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) { - int width = image->nx; - int height = image->ny; - int num_patches = (height / patch_size) * (width / patch_size); - clip_image_f32 * patch = clip_image_f32_init(); - patch->nx = patch_size * num_patches; - patch->ny = patch_size; - patch->buf.resize(3 * patch->nx * patch->ny); - - int patch_index = 0; - - for (int i = 0; i < height; i += patch_size) { - for (int j = 0; j < width; j += patch_size) { - for (int pi = 0; pi < patch_size; ++pi) { - for (int pj = 0; pj < patch_size; ++pj) { - int input_index = ((i + pi) * width + (j + pj)) * 3; - int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3; - patch->buf[output_index] = image->buf[input_index]; - patch->buf[output_index+1] = image->buf[input_index+1]; - patch->buf[output_index+2] = image->buf[input_index+2]; - } - } - patch_index++; - } - } - return patch; -} static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { // std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 @@ -250,7 +209,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli img_res_v.size = 0; img_res_v.data = nullptr; if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { - LOG_ERR("%s: unable to preprocess image\n", __func__); + LOG_TEE("%s: unable to preprocess image\n", __func__); delete[] img_res_v.data; return false; } @@ -259,84 +218,17 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); - if (clip_is_minicpmv(ctx_clip) || 
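The reshape_by_patch() helper removed above re-lays an H x W x 3 float image out as a single row of patch_size x patch_size tiles before encoding. A standalone sketch of the same transform over a plain float buffer, assuming the dimensions are multiples of patch_size:

#include <vector>

static std::vector<float> reshape_by_patch(const std::vector<float> & img, int width, int height, int patch_size) {
    const int num_patches = (height / patch_size) * (width / patch_size);
    const int out_w       = patch_size * num_patches;   // width of the flattened patch row
    std::vector<float> out((size_t) 3 * out_w * patch_size);
    int patch_index = 0;
    for (int i = 0; i < height; i += patch_size) {
        for (int j = 0; j < width; j += patch_size) {
            for (int pi = 0; pi < patch_size; ++pi) {
                for (int pj = 0; pj < patch_size; ++pj) {
                    const int src = ((i + pi) * width + (j + pj)) * 3;
                    const int dst = (pi * out_w + patch_index * patch_size + pj) * 3;
                    out[dst + 0] = img[src + 0];
                    out[dst + 1] = img[src + 1];
                    out[dst + 2] = img[src + 2];
                }
            }
            patch_index++;
        }
    }
    return out;
}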
clip_is_qwen2vl(ctx_clip)) { - std::vector image_embd_v; - image_embd_v.resize(img_res_v.size); - struct clip_image_size * load_image_size = clip_image_size_init(); - - for (size_t i = 0; i < img_res_v.size; i++) { - const int64_t t_img_enc_step_start_us = ggml_time_us(); - image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); - int patch_size=14; - load_image_size->width = img_res_v.data[i].nx; - load_image_size->height = img_res_v.data[i].ny; - clip_add_load_image_size(ctx_clip, load_image_size); - - bool encoded = false; - if (clip_is_qwen2vl(ctx_clip)) { - encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); - } - else { - encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); - } - - if (!encoded) { - LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); - return false; - } - const int64_t t_img_enc_steop_batch_us = ggml_time_us(); - LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); - } - const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); - - int n_img_pos_out = 0; - for (size_t i = 0; i < image_embd_v.size(); i++) { - std::memcpy( - image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), - image_embd_v[i], - clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); - n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]); - } - *n_img_pos = n_img_pos_out; - for (size_t i = 0; i < image_embd_v.size(); i++) { - free(image_embd_v[i]); - } - image_embd_v.clear(); - load_image_size->width = img->nx; - load_image_size->height = img->ny; - clip_add_load_image_size(ctx_clip, load_image_size); - LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); - delete[] img_res_v.data; - img_res_v.size = 0; - img_res_v.data = nullptr; - } - else if (clip_is_glm(ctx_clip)){ - struct clip_image_size * load_image_size = clip_image_size_init(); - load_image_size->width = img_res_v.data[0].nx; - load_image_size->height = img_res_v.data[0].ny; - clip_add_load_image_size(ctx_clip, load_image_size); - - bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); - int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2); - *n_img_pos = (pos * pos + 2); - if (!encoded){ - LOG_ERR("Unable to encode image \n"); - return false; - } - } - else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { + if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 delete[] img_res_v.data; if (!encoded) { - LOG_ERR("Unable to encode image\n"); + LOG_TEE("Unable to encode image\n"); return false; } - } - else { + } else { // spatial_unpad llava-1.6 type embedding // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working std::vector image_embd_v; @@ -345,12 +237,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd_v[i] = (float 
*)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside if (!encoded) { - LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); const int32_t * image_grid = clip_image_grid(ctx_clip); @@ -383,50 +275,37 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); } - LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; - LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); + LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); return true; } bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { // make sure that the correct mmproj was used, i.e., compare apples to apples - int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama)); + int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); auto n_image_embd = clip_n_mmproj_embd(ctx_clip); if (n_image_embd != n_llama_embd) { - LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); + LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); return false; } return true; } bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - int num_max_patches = 6; - if (clip_is_minicpmv(ctx_clip)) { - num_max_patches = 10; - } - if (clip_is_glm(ctx_clip)) { - num_max_patches = 1; - } - float * image_embd; - if (clip_is_qwen2vl(ctx_clip)) { - // qwen2vl don't split image into chunks, so `num_max_patches` is not needed. 
- image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny)); - } else { - image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model - } + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model if (!image_embd) { - LOG_ERR("Unable to allocate memory for image embeddings\n"); + LOG_TEE("Unable to allocate memory for image embeddings\n"); return false; } int n_img_pos; if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { - LOG_ERR("%s: cannot encode image, aborting\n", __func__); + LOG_TEE("%s: cannot encode image, aborting\n", __func__); free(image_embd); return false; } @@ -436,51 +315,17 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co return true; } -struct llava_embd_batch { - std::vector pos; - std::vector n_seq_id; - std::vector seq_id_0; - std::vector seq_ids; - std::vector logits; - llama_batch batch; - llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); - logits .resize(n_tokens); - seq_id_0.resize(1); - seq_id_0[0] = seq_id; - seq_ids [n_tokens] = nullptr; - batch = { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), - /*logits =*/ logits.data(), - }; - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = pos_0 + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = false; - } - } -}; - bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) { - int n_embd = llama_model_n_embd(llama_get_model(ctx_llama)); + int n_embd = llama_n_embd(llama_get_model(ctx_llama)); for (int i = 0; i < image_embed->n_image_pos; i += n_batch) { int n_eval = image_embed->n_image_pos - i; if (n_eval > n_batch) { n_eval = n_batch; } - float * embd = image_embed->embed+i*n_embd; - llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); - if (llama_decode(ctx_llama, llava_batch.batch)) { - LOG_ERR("%s : failed to eval\n", __func__); + llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; + if (llama_decode(ctx_llama, batch)) { + LOG_TEE("%s : failed to eval\n", __func__); return false; } *n_past += n_eval; @@ -492,7 +337,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c clip_image_u8 * img = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { clip_image_u8_free(img); - LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__); + LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); return NULL; } @@ -501,7 +346,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); if (!image_embed_result) { clip_image_u8_free(img); - LOG_ERR("%s: couldn't embed the image\n", __func__); + LOG_TEE("%s: coulnd't embed the image\n", __func__); return NULL; } @@ -515,7 +360,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { 
auto file = fopen(path, "rb"); if (file == NULL) { - LOG_ERR("%s: can't read file %s\n", __func__, path); + LOG_TEE("%s: can't read file %s\n", __func__, path); return false; } @@ -525,7 +370,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data if (buffer == NULL) { - LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); + LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); perror("Memory allocation error"); fclose(file); return false; @@ -533,16 +378,10 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long errno = 0; size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer if (ferror(file)) { - LOG_ERR("read error: %s", strerror(errno)); - free(buffer); - fclose(file); - return false; + die_fmt("read error: %s", strerror(errno)); } if (ret != (size_t) fileSize) { - LOG_ERR("unexpectedly reached end of file"); - free(buffer); - fclose(file); - return false; + die("unexpectedly reached end of file"); } fclose(file); // Close the file @@ -556,7 +395,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx long image_bytes_length; auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); if (!loaded) { - LOG_ERR("%s: failed to load %s\n", __func__, image_path); + LOG_TEE("%s: failed to load %s\n", __func__, image_path); return NULL; } diff --git a/examples/llava/llava.h b/examples/llava/llava.h index b6feb3027..19212f6e9 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -17,11 +17,12 @@ # define LLAVA_API #endif +struct clip_ctx; + #ifdef __cplusplus extern "C" { #endif -struct clip_ctx; struct llava_image_embed { float * embed; int n_image_pos; @@ -36,8 +37,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); /** build an image embed from a path to an image filename */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -/** free an embedding made with llava_image_embed_make_* */ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); +/** free an embedding made with llava_image_embed_make_* */ /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
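load_file_to_bytes() above reads the whole image file into a heap buffer; the hunk only changes its error handling. A self-contained sketch of the same read-into-buffer pattern that reports failures and always closes the file, using std::vector instead of a raw malloc buffer; it is not the file's implementation:

#include <cstdio>
#include <string>
#include <vector>

static std::vector<unsigned char> load_file_bytes(const std::string & path) {
    std::vector<unsigned char> bytes;
    FILE * f = std::fopen(path.c_str(), "rb");
    if (f == nullptr) {
        std::fprintf(stderr, "can't read file %s\n", path.c_str());
        return bytes;
    }
    std::fseek(f, 0, SEEK_END);
    const long size = std::ftell(f);
    std::fseek(f, 0, SEEK_SET);
    if (size > 0) {
        bytes.resize((size_t) size);
        if (std::fread(bytes.data(), 1, bytes.size(), f) != bytes.size()) {
            std::fprintf(stderr, "short read on %s\n", path.c_str());
            bytes.clear();
        }
    }
    std::fclose(f);
    return bytes;
}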
*/ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp deleted file mode 100644 index 53d902d61..000000000 --- a/examples/llava/minicpmv-cli.cpp +++ /dev/null @@ -1,335 +0,0 @@ -#include "arg.h" -#include "log.h" -#include "common.h" -#include "sampling.h" -#include "clip.h" -#include "llava.h" -#include "llama.h" -#include "ggml.h" - -#include -#include -#include -#include -#include -#include // TODO: remove me - -struct llava_context { - struct clip_ctx * ctx_clip = NULL; - struct llama_context * ctx_llama = NULL; - struct llama_model * model = NULL; -}; - -static void show_additional_info(int /*argc*/, char ** argv) { - LOG("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static struct llama_model * llava_init(common_params * params) { - llama_backend_init(); - llama_numa_init(params->numa); - - llama_model_params model_params = common_model_params_to_llama(*params); - - llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); - if (model == NULL) { - LOG_ERR("%s: unable to load model\n" , __func__); - return NULL; - } - return model; -} - -static struct llava_context * llava_init_context(common_params * params, llama_model * model) { - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - - llama_context_params ctx_params = common_context_params_to_llama(*params); - if (params->n_ctx < 2048) { - // warn user here, "Image processing requires at least 2048 context, setting context to 2048" - LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); - ctx_params.n_ctx = 2048; - } else { - ctx_params.n_ctx = params->n_ctx; - } - - llama_context * ctx_llama = llama_init_from_model(model, ctx_params); - - if (ctx_llama == NULL) { - LOG_ERR("%s: failed to create the llama_context\n" , __func__); - return NULL; - } - - auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); - - ctx_llava->ctx_llama = ctx_llama; - ctx_llava->model = model; - return ctx_llava; -} - -static void llava_free(struct llava_context * ctx_llava) { - if (ctx_llava->ctx_clip) { - clip_free(ctx_llava->ctx_clip); - ctx_llava->ctx_clip = NULL; - } - - llama_free(ctx_llava->ctx_llama); - llama_model_free(ctx_llava->model); - llama_backend_free(); -} - -static struct clip_ctx * clip_init_context(common_params * params) { - const char * clip_path = params->mmproj.c_str(); - - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - return ctx_clip; -} - -static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { - int N = (int) tokens.size(); - for (int i = 0; i < N; i += n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) { - LOG_ERR("%s : failed to eval. 
token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); - return false; - } - *n_past += n_eval; - } - return true; -} - -static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(ctx_llama, tokens, 1, n_past); -} - -static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ - std::string str2 = str; - std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); - return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); -} - -static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) { - float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip)); - std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); - - auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); - slice_embed->embed = image_embed; - slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); - llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); - llava_image_embed_free(slice_embed); -} - -static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) { - std::string system_prompt; - int idx = 0; - int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); - int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); - if (has_minicpmv_projector == 2) { - system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; - } - else if (has_minicpmv_projector == 3) { - system_prompt = "<|im_start|>user\n"; - } - else if (has_minicpmv_projector == 4) { - system_prompt = "<|im_start|>user\n"; - } - LOG_INF("%s: image token past: %d\n", __func__, n_past); - eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (num_image_embeds > 1) { - size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { - for (size_t j = 0; j < num_image_embeds_col; ++j) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == num_image_embeds_col - 1) { - eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); - } - } - } - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - } - LOG_INF("%s: image token past: %d\n", __func__, n_past); -} - -static const char * sample(struct common_sampler * smpl, - struct llama_context * ctx_llama, - int * n_past) { - const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); - common_sampler_accept(smpl, id, true); - - const llama_model * model = llama_get_model(ctx_llama); - const llama_vocab * vocab = llama_model_get_vocab(model); - - static 
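process_eval_image_embed() above copies the idx-th slice (n_patches x n_embd floats) out of the full embedding buffer so each image slice can be evaluated separately. A generic sketch of that copy, with the clip query calls abstracted into plain parameters:

#include <cstring>
#include <vector>

static std::vector<float> take_embed_slice(const float * all_embeds, int idx, int n_patches, int n_embd) {
    std::vector<float> slice((size_t) n_patches * n_embd);
    std::memcpy(slice.data(), all_embeds + (size_t) idx * n_patches * n_embd, slice.size() * sizeof(float));
    return slice;
}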
std::string ret; - if (llama_vocab_is_eog(vocab, id)) { - ret = ""; - } else { - ret = common_token_to_piece(ctx_llama, id); - } - eval_id(ctx_llama, id, n_past); - return ret.c_str(); -} - -static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){ - auto * ctx_clip = clip_init_context(params); - auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); - if (!embeds) { - LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str()); - return NULL; - } - - // process the prompt - if (params->prompt.empty() && params->interactive == false) { - LOG_ERR("prompt should be given or interactive mode should be on"); - return NULL; - } - - auto * model = llava_init(params); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); - return NULL; - } - const int64_t t_llava_init_start_us = ggml_time_us(); - auto * ctx_llava = llava_init_context(params, model); - ctx_llava->ctx_clip = ctx_clip; - const int64_t t_llava_init_end_us = ggml_time_us(); - float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; - LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); - - const int64_t t_process_image_start_us = ggml_time_us(); - process_image(ctx_llava, embeds, params, n_past); - const int64_t t_process_image_end_us = ggml_time_us(); - float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; - LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); - - llava_image_embed_free(embeds); - return ctx_llava; -} - -static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){ - std::string user_prompt = prompt; - int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); - if (!is_first) { - if (has_minicpmv_projector == 2) { - user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; - } - else if (has_minicpmv_projector == 3) { - user_prompt = "<|im_start|>user\n" + prompt; - } - else if (has_minicpmv_projector == 4) { - user_prompt = "<|im_start|>user\n" + prompt; - } - } - - eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); - if (has_minicpmv_projector == 2) { - eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); - } - else if (has_minicpmv_projector == 3) { - eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false); - } - else if (has_minicpmv_projector == 4) { - eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false); - } - - // generate the response - - LOG_INF("\n"); - - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); - return smpl; -} - -static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){ - - const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); - return tmp; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { - return 1; - } - - common_init(); - - if (params.mmproj.empty() || (params.image.empty())) { - show_additional_info(argc, argv); - return 1; - } - - for (auto & image : 
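llama_init() above wraps the user prompt in a version-dependent chat template: MiniCPM-V 2 uses a Llama-3 style header, while versions 3 and 4 use ChatML-style markers. A compact sketch of that selection:

#include <string>

static std::string wrap_user_prompt(const std::string & prompt, int minicpmv_version) {
    if (minicpmv_version == 2) {
        return "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
    }
    // versions 3 and 4 share the same template
    return "<|im_start|>user\n" + prompt;
}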
params.image) { - int n_past = 0; - auto * ctx_llava = minicpmv_init(¶ms, image, n_past); - - if (!params.prompt.empty()) { - LOG("%s\n", params.prompt.c_str()); - LOG(""); - auto * smpl = llama_init(ctx_llava, ¶ms, params.prompt, n_past, true); - const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response; - bool have_tmp = false; - for (int i = 0; i < max_tgt_len; i++) { - const auto * tmp = llama_loop(ctx_llava, smpl, n_past); - response += tmp; - if (strcmp(tmp, "") == 0){ - if (!have_tmp) { - continue; - } - break; - } - if (strstr(tmp, "###")) break; // Yi-VL behavior - have_tmp = true; - printf("%s", tmp); - if (strstr(response.c_str(), "")) break; // minicpm-v - - fflush(stdout); - } - common_sampler_free(smpl); - }else { - while (true) { - LOG(""); - std::string prompt; - std::getline(std::cin, prompt); - LOG(""); - auto * smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true); - const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response; - for (int i = 0; i < max_tgt_len; i++) { - const auto * tmp = llama_loop(ctx_llava, smpl, n_past); - response += tmp; - if (strcmp(tmp, "") == 0) break; - printf("%s", tmp);// mistral llava-1.6 - if (strstr(response.c_str(), "")) break; // minicpm-v - fflush(stdout); - } - common_sampler_free(smpl); - } - } - printf("\n"); - llama_perf_context_print(ctx_llava->ctx_llama); - - ctx_llava->model = NULL; - llava_free(ctx_llava); - } - - return 0; -} diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py deleted file mode 100644 index 9b196757f..000000000 --- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py +++ /dev/null @@ -1,815 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Siglip model. """ -# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes - - -import os -import math -import warnings - -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn.init import _calculate_fan_in_and_fan_out - -from transformers.activations import ACT2FN -from transformers.modeling_utils import PreTrainedModel -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import ( - logging, -) -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -class SiglipVisionConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a - Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip - [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture. 
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - num_channels (`int`, *optional*, defaults to 3): - Number of channels in the input images. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - Example: - ```python - >>> from transformers import SiglipVisionConfig, SiglipVisionModel - >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration - >>> configuration = SiglipVisionConfig() - >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration - >>> model = SiglipVisionModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "siglip_vision_model" - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=16, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - -_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" - -SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "google/siglip-base-patch16-224", - # See all SigLIP models at https://huggingface.co/models?filter=siglip -] - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -def _trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official releases - RW - # Method based on 
https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn( - "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " - "The distribution of values may be incorrect.", - stacklevel=2, - ) - - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - l = norm_cdf((a - mean) / std) - u = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * l - 1, 2 * u - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - if tensor.dtype in [torch.float16, torch.bfloat16]: - # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu - og_dtype = tensor.dtype - tensor = tensor.to(torch.float32) - tensor.erfinv_() - tensor = tensor.to(og_dtype) - else: - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.0)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - if tensor.dtype == torch.float16: - # The `clamp_` op is not (yet?) defined in float16+cpu - tensor = tensor.to(torch.float32) - tensor.clamp_(min=a, max=b) - tensor = tensor.to(torch.float16) - else: - tensor.clamp_(min=a, max=b) - - -def trunc_normal_tf_( - tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0 -): - """Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \\leq \text{mean} \\leq b`. - NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the - bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 - and the result is subsquently scaled and shifted by the mean and std args. 
- Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - """ - with torch.no_grad(): - _trunc_normal_(tensor, 0, 1.0, a, b) - tensor.mul_(std).add_(mean) - - -def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - denom = fan_in - if mode == "fan_in": - denom = fan_in - elif mode == "fan_out": - denom = fan_out - elif mode == "fan_avg": - denom = (fan_in + fan_out) / 2 - - variance = scale / denom - - if distribution == "truncated_normal": - # constant is stddev of standard normal truncated to (-2, 2) - trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) - elif distribution == "normal": - with torch.no_grad(): - tensor.normal_(std=math.sqrt(variance)) - elif distribution == "uniform": - bound = math.sqrt(3 * variance) - with torch.no_grad(): - tensor.uniform_(-bound, bound) - else: - raise ValueError(f"invalid distribution {distribution}") - - -def lecun_normal_(tensor): - variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") - - -def default_flax_embed_init(tensor): - variance_scaling_(tensor, mode="fan_in", distribution="normal") - -class SiglipVisionEmbeddings(nn.Module): - def __init__(self, config: SiglipVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - padding="valid", - ) - - self.num_patches_per_side = self.image_size // self.patch_size - self.num_patches = self.num_patches_per_side**2 - self.num_positions = self.num_patches - self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - -class SiglipAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - -# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip -class SiglipMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) - self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) - - -# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip -class SiglipEncoderLayer(nn.Module): - def __init__(self, config: SiglipVisionConfig): - super().__init__() - self.embed_dim = config.hidden_size - self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" - self.self_attn = ( - SiglipAttention(config) - ) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = SiglipMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - -class SiglipPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = SiglipVisionConfig - base_model_prefix = "siglip" - supports_gradient_checkpointing = True - - def _init_weights(self, module): - """Initialize the weights""" - - if isinstance(module, SiglipVisionEmbeddings): - width = self.config.hidden_size - nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width)) - elif isinstance(module, nn.Embedding): - default_flax_embed_init(module.weight) - elif isinstance(module, SiglipAttention): - nn.init.normal_(module.q_proj.weight) - nn.init.normal_(module.k_proj.weight) - nn.init.normal_(module.v_proj.weight) - nn.init.normal_(module.out_proj.weight) - nn.init.zeros_(module.q_proj.bias) - nn.init.zeros_(module.k_proj.bias) - nn.init.zeros_(module.v_proj.bias) - nn.init.zeros_(module.out_proj.bias) - elif isinstance(module, SiglipMLP): - nn.init.normal_(module.fc1.weight) - nn.init.normal_(module.fc2.weight) - nn.init.normal_(module.fc1.bias, std=1e-6) - nn.init.normal_(module.fc2.bias, std=1e-6) - elif isinstance(module, (nn.Linear, nn.Conv2d)): - lecun_normal_(module.weight) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - -SIGLIP_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - Parameters: - config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -SIGLIP_VISION_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip -class SiglipEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`SiglipEncoderLayer`]. - Args: - config: SiglipConfig - """ - - def __init__(self, config: SiglipVisionConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - -class SiglipVisionTransformer(SiglipPreTrainedModel): - config_class = SiglipVisionConfig - main_input_name = "pixel_values" - _supports_flash_attn_2 = True - - def __init__(self, config: SiglipVisionConfig): - super().__init__(config) - self.config = config - embed_dim = config.hidden_size - - self.embeddings = SiglipVisionEmbeddings(config) - self.encoder = SiglipEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.embeddings.patch_embedding - -import argparse -import json -import re - -import numpy as np -from gguf import * -from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig - -TEXT = "clip.text" -VISION = "clip.vision" - - -def add_key_str(raw_key: str, arch: str) -> str: - return raw_key.format(arch=arch) - - -def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool: - if name in ( - "logit_scale", - "text_model.embeddings.position_ids", - "vision_model.embeddings.position_ids", - ): - return True - - if has_minicpmv and name in ["visual_projection.weight"]: - return True - - if name.startswith("v") and not has_vision: - return True - - if name.startswith("t") and not has_text: - return True - - return False - - -def get_tensor_name(name: str) -> str: - if "projection" in name: - return name - if "mm_projector" in name: - name = name.replace("model.mm_projector", "mm") - name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) - name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) - return name - - return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") - - -def bytes_to_unicode(): - """ - Returns list of utf-8 byte 
and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) - + list(range(ord("¡"), ord("¬") + 1)) - + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) -ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") -ap.add_argument("--text-only", action="store_true", required=False, - help="Save a text-only model. It can't be used to encode images") -ap.add_argument("--vision-only", action="store_true", required=False, - help="Save a vision-only model. It can't be used to encode texts") -ap.add_argument("--clip-model-is-vision", action="store_true", required=False, - help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") -ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, - help="The clip model is from openclip (for ViT-SO400M type))") -ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.") -ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") -ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) -# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 -# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 -default_image_mean = [0.48145466, 0.4578275, 0.40821073] -default_image_std = [0.26862954, 0.26130258, 0.27577711] -ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) -ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) -ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2) - -# with proper -args = ap.parse_args() - - -if args.text_only and args.vision_only: - print("--text-only and --image-only arguments cannot be specified at the same time.") - exit(1) - -if args.use_f32: - print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") - -# output in the same directory as the model if output_dir is None -dir_model = args.model_dir - -if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: - vocab = None - tokens = None -else: - with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - vocab = json.load(f) - tokens = [key for key in vocab] - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if args.use_f32: - ftype = 0 - -# if args.clip_model_is_vision or args.clip_model_is_openclip: -# model = CLIPVisionModel.from_pretrained(dir_model) -# processor = None -# else: -# model = CLIPModel.from_pretrained(dir_model) -# processor = CLIPProcessor.from_pretrained(dir_model) - -minicpmv_version = args.minicpmv_version -emb_dim = 4096 -block_count = 26 -if minicpmv_version == 1: - emb_dim = 2304 - block_count = 26 -elif minicpmv_version == 2: - emb_dim = 4096 - block_count = 27 -elif minicpmv_version == 3: - emb_dim = 3584 - block_count = 27 -elif minicpmv_version == 4: - emb_dim = 3584 - block_count = 27 - -default_vision_config = { - "hidden_size": 1152, - "image_size": 980, - "intermediate_size": 4304, - "model_type": "idefics2", - "num_attention_heads": 16, - "num_hidden_layers": 27, - "patch_size": 14, - } - -vision_config = Idefics2VisionConfig(**default_vision_config) -model = Idefics2VisionTransformer(vision_config) -if minicpmv_version == 3: - vision_config = SiglipVisionConfig(**default_vision_config) - model = SiglipVisionTransformer(vision_config) -elif minicpmv_version == 4: - vision_config = SiglipVisionConfig(**default_vision_config) - model = SiglipVisionTransformer(vision_config) - -processor = None -# if model.attn_pool is not None: -# model.attn_pool = torch.nn.Identity() - -# model.blocks = model.blocks[:-1] -model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip"))) - -fname_middle = None -has_text_encoder = True -has_vision_encoder = True -has_minicpmv_projector = False - -if args.text_only: - fname_middle = "text-" - has_vision_encoder = False -elif args.minicpmv_projector is not None: - fname_middle = "mmproj-" - has_text_encoder = False - has_minicpmv_projector = True - minicpmv_version = 4 -elif args.vision_only: - fname_middle = "vision-" - has_text_encoder = False -else: - fname_middle = "" - -output_dir = 
args.output_dir if args.output_dir is not None else dir_model -os.makedirs(output_dir, exist_ok=True) -output_prefix = os.path.basename(output_dir).replace("ggml_", "") -fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") -fout = GGUFWriter(path=fname_out, arch="clip") - -fout.add_bool("clip.has_text_encoder", has_text_encoder) -fout.add_bool("clip.has_vision_encoder", has_vision_encoder) -fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector) -fout.add_file_type(ftype) -if args.text_only: - fout.add_description("text-only CLIP model") -elif args.vision_only and not has_minicpmv_projector: - fout.add_description("vision-only CLIP model") -elif has_minicpmv_projector: - fout.add_description("image encoder for MiniCPM-V") - # add projector type - fout.add_string("clip.projector_type", "resampler") - fout.add_int32("clip.minicpmv_version", minicpmv_version) -else: - fout.add_description("two-tower CLIP model") - -if has_vision_encoder: - # vision_model hparams - fout.add_uint32("clip.vision.image_size", 448) - fout.add_uint32("clip.vision.patch_size", 14) - fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152) - fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304) - fout.add_uint32("clip.vision.projection_dim", 0) - fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16) - fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) - fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count) - - if processor is not None: - image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean - image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std - else: - image_mean = args.image_mean if args.image_mean is not None else default_image_mean - image_std = args.image_std if args.image_std is not None else default_image_std - fout.add_array("clip.vision.image_mean", image_mean) - fout.add_array("clip.vision.image_std", image_std) - -use_gelu = True -fout.add_bool("clip.use_gelu", use_gelu) - -def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): - """ - embed_dim: output dimension for each position - pos: a list of positions to be encoded: size (M,) - out: (M, D) - """ - assert embed_dim % 2 == 0 - omega = np.arange(embed_dim // 2, dtype=np.float32) - omega /= embed_dim / 2. - omega = 1. 
/ 10000 ** omega # (D/2,) - - pos = pos.reshape(-1) # (M,) - out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product - - emb_sin = np.sin(out) # (M, D/2) - emb_cos = np.cos(out) # (M, D/2) - - emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) - return emb - -def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): - assert embed_dim % 2 == 0 - - # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) - - emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) - return emb - - -# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 -def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): - """ - grid_size: int of the grid height and width - return: - pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) - """ - if isinstance(grid_size, int): - grid_h_size, grid_w_size = grid_size, grid_size - else: - grid_h_size, grid_w_size = grid_size[0], grid_size[1] - - grid_h = np.arange(grid_h_size, dtype=np.float32) - grid_w = np.arange(grid_w_size, dtype=np.float32) - grid = np.meshgrid(grid_w, grid_h) # here w goes first - grid = np.stack(grid, axis=0) - - grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) - pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) - if cls_token: - pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) - return pos_embed - -def _replace_name_resampler(s, v): - if re.match("resampler.pos_embed", s): - return { - s: v, - re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))), - } - if re.match("resampler.proj", s): - return { - re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))), - re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(), - } - if re.match("resampler.attn.in_proj_.*", s): - return { - re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0], - re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1], - re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2], - } - return {s: v} - -if has_minicpmv_projector: - projector = torch.load(args.minicpmv_projector) - new_state_dict = {} - for k, v in projector.items(): - kvs = _replace_name_resampler(k, v) - for nk, nv in kvs.items(): - new_state_dict[nk] = nv - projector = new_state_dict - ftype_cur = 0 - for name, data in projector.items(): - name = get_tensor_name(name) - data = data.squeeze().numpy() - - n_dims = len(data.shape) - if ftype == 1: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - - fout.add_tensor(name, data) - print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") - - print("Projector tensors added\n") - -def _replace_name(s, v): - s = "vision_model." 
+ s - if re.match("vision_model.embeddings.position_embedding", s): - v = v.unsqueeze(0) - return {s: v} - - return {s: v} - -state_dict = model.state_dict() -new_state_dict = {} -for k, v in state_dict.items(): - kvs = _replace_name(k, v) - for nk, nv in kvs.items(): - new_state_dict[nk] = nv -state_dict = new_state_dict -for name, data in state_dict.items(): - if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector): - # we don't need this - print(f"skipping parameter: {name}") - continue - - name = get_tensor_name(name) - data = data.squeeze().numpy() - - n_dims = len(data.shape) - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0 - if n_dims == 4: - print(f"tensor {name} is always saved in f16") - data = data.astype(np.float16) - ftype_cur = 1 - elif ftype == 1: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - - print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") - fout.add_tensor(name, data) - - -fout.write_header_to_file() -fout.write_kv_data_to_file() -fout.write_tensors_to_file() -fout.close() - -print("Done. Output file: " + fname_out) diff --git a/examples/llava/minicpmv-surgery.py b/examples/llava/minicpmv-surgery.py deleted file mode 100644 index ba8211658..000000000 --- a/examples/llava/minicpmv-surgery.py +++ /dev/null @@ -1,45 +0,0 @@ -import argparse -import os -import torch -from transformers import AutoModel, AutoTokenizer - -ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model", help="Path to MiniCPM-V model") -args = ap.parse_args() - -# find the model part that includes the the multimodal projector weights -model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16) -checkpoint = model.state_dict() - -# get a list of mm tensor names -mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")] - -# store these tensors in a new dictionary and torch.save them -projector = {name: checkpoint[name].float() for name in mm_tensors} -torch.save(projector, f"{args.model}/minicpmv.projector") - -clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")] -if len(clip_tensors) > 0: - clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors} - torch.save(clip, f"{args.model}/minicpmv.clip") - - # added tokens should be removed to be able to convert Mistral models - if os.path.exists(f"{args.model}/added_tokens.json"): - with open(f"{args.model}/added_tokens.json", "w") as f: - f.write("{}\n") - -config = model.llm.config -config.auto_map = { - "AutoConfig": "configuration_minicpm.MiniCPMConfig", - "AutoModel": "modeling_minicpm.MiniCPMModel", - "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM", - "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM", - "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification" -} -model.llm.save_pretrained(f"{args.model}/model") -tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) -tok.save_pretrained(f"{args.model}/model") - -print("Done!") -print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf 
file.") diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py deleted file mode 100644 index c87606b4f..000000000 --- a/examples/llava/qwen2_vl_surgery.py +++ /dev/null @@ -1,165 +0,0 @@ -import argparse -from typing import Dict - -import torch -import numpy as np -from gguf import * -from transformers import ( - Qwen2VLForConditionalGeneration, - Qwen2VLProcessor, - AutoProcessor, - Qwen2VLConfig -) - - -VISION = "clip.vision" - - -def k(raw_key: str, arch: str) -> str: - return raw_key.format(arch=arch) - - -def to_gguf_name(name: str) -> str: - og = name - name = name.replace("text_model", "t").replace("vision_model", "v") - name = name.replace("blocks", "blk").replace("embeddings.", "") - name = name.replace("attn.", "attn_") - name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.") - # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln") - name = name.replace("norm1", "ln1").replace("norm2", "ln2") - name = name.replace("merger.mlp", 'mm') - print(f"[to_gguf_name] {og} --> {name}") - return name - - -def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]: - vision_model = qwen2vl.visual - tensor_map = {} - for name, ten in vision_model.state_dict().items(): - ten = ten.numpy() - if 'qkv' in name: - if ten.ndim == 2: # weight - c3, _ = ten.shape - else: # bias - c3 = ten.shape[0] - assert c3 % 3 == 0 - c = c3 // 3 - wq = ten[:c] - wk = ten[c: c * 2] - wv = ten[c * 2:] - tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq - tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk - tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv - elif 'merger' in name: - if name.endswith("ln_q.weight"): - tensor_map['v.post_ln.weight'] = ten - elif name.endswith("ln_q.bias"): - tensor_map['v.post_ln.bias'] = ten - else: - # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias" - tensor_map[to_gguf_name(name)] = ten - elif 'patch_embed.proj.weight' in name: - # NOTE: split Conv3D into Conv2Ds - c1, c2, kt, kh, kw = ten.shape - assert kt == 2, "Current implmentation only support temporal_patch_size of 2" - tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...] - tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...] 
- else: - tensor_map[to_gguf_name(f"vision_model.{name}")] = ten - - for new_name, ten in tensor_map.items(): - if ten.ndim <= 1 or new_name.endswith("_norm.weight"): - tensor_map[new_name] = ten.astype(np.float32) - else: - tensor_map[new_name] = ten.astype(dtype) - tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder - return tensor_map - - -def main(args): - if args.data_type == 'fp32': - dtype = torch.float32 - np_dtype = np.float32 - ftype = 0 - elif args.data_type == 'fp16': - dtype = torch.float32 - np_dtype = np.float16 - ftype = 1 - else: - raise ValueError() - - local_model = False - model_path = "" - model_name = args.model_name - print("model_name: ", model_name) - qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained( - model_name, torch_dtype=dtype, device_map="cpu" - ) - cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType] - vcfg = cfg.vision_config - - if os.path.isdir(model_name): - local_model = True - if model_name.endswith(os.sep): - model_name = model_name[:-1] - model_path = model_name - model_name = os.path.basename(model_name) - fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf" - - fout = GGUFWriter(path=fname_out, arch="clip") - fout.add_description("image encoder for Qwen2VL") - - fout.add_file_type(ftype) - fout.add_bool("clip.has_text_encoder", False) - fout.add_bool("clip.has_vision_encoder", True) - fout.add_bool("clip.has_qwen2vl_merger", True) - fout.add_string("clip.projector_type", "qwen2vl_merger") - - print(cfg.vision_config) - if 'silu' in cfg.vision_config.hidden_act.lower(): - fout.add_bool("clip.use_silu", True) - fout.add_bool("clip.use_gelu", False) - elif 'gelu' in cfg.vision_config.hidden_act.lower(): - fout.add_bool("clip.use_silu", False) - fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower()) - else: - raise ValueError() - - tensor_map = find_vision_tensors(qwen2vl, np_dtype) - for name, data in tensor_map.items(): - fout.add_tensor(name, data) - - fout.add_uint32("clip.vision.patch_size", vcfg.patch_size) - fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2) - fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim) - fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size) - fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads) - fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) - fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth) - fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder - fout.add_name(model_name) - """ - HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig, - it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`. 
- """ - - if local_model: - processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path) - else: - processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name) - fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue] - fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue] - - fout.write_header_to_file() - fout.write_kv_data_to_file() - fout.write_tensors_to_file() - fout.close() - print("save model as: ", fname_out) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct") - parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32") - args = parser.parse_args() - main(args) diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp deleted file mode 100644 index 132a7da54..000000000 --- a/examples/llava/qwen2vl-cli.cpp +++ /dev/null @@ -1,584 +0,0 @@ -#include "arg.h" -#include "base64.hpp" -#include "log.h" -#include "common.h" -#include "sampling.h" -#include "clip.h" -#include "llava.h" -#include "llama.h" -#include "ggml.h" - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif -#ifdef NDEBUG -#include "ggml-alloc.h" -#include "ggml-backend.h" -#endif - -#include -#include -#include -#include -#include -#include -#include - - -static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, - int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) { - int n_embd = llama_model_n_embd(llama_get_model(ctx_llama)); - const int patch_size = 14 * 2; - const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0); - const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0); - auto img_tokens = image_embed->n_image_pos; - // llama_pos mrope_pos[img_tokens * 4]; - std::vector mrope_pos; - mrope_pos.resize(img_tokens * 4); - - for (int y = 0; y < ph; y++) - { - for (int x = 0; x < pw; x++) - { - int i = y * pw + x; - mrope_pos[i] = *st_pos_id; - mrope_pos[i + img_tokens] = *st_pos_id + y; - mrope_pos[i + img_tokens * 2] = *st_pos_id + x; - mrope_pos[i + img_tokens * 3] = 0; - } - } - *st_pos_id += std::max(pw, ph); - - int processed = 0; - std::vector batch_mrope_pos; - batch_mrope_pos.resize(img_tokens * 4); - - for (int i = 0; i < img_tokens; i += n_batch) { - int n_eval = img_tokens - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - - // llama_pos batch_mrope_pos[n_eval * 4]; - std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0); - memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos)); - memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos)); - memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos)); - memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos)); - - llama_batch batch = { - int32_t(n_eval), // n_tokens - nullptr, // token - (image_embed->embed+i*n_embd), // embed - batch_mrope_pos.data(), // pos - nullptr, // n_seq_id - nullptr, // seq_id - nullptr, // logits - }; - - if (llama_decode(ctx_llama, batch)) { - LOG_ERR("%s : failed to eval\n", __func__); - return false; - } - *n_past += n_eval; - processed += n_eval; - } - return true; -} - - -static bool eval_tokens(struct 
llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past, int * st_pos_id) { - int N = (int) tokens.size(); - std::vector pos; - for (int i = 0; i < N; i += n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - auto batch = llama_batch_get_one(&tokens[i], n_eval); - // TODO: add mrope pos ids somewhere else - pos.resize(batch.n_tokens * 4); - std::fill(pos.begin(), pos.end(), 0); - for (int j = 0; j < batch.n_tokens * 3; j ++) { - pos[j] = *st_pos_id + (j % batch.n_tokens); - } - batch.pos = pos.data(); - - if (llama_decode(ctx_llama, batch)) { - LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); - return false; - } - *n_past += n_eval; - *st_pos_id += n_eval; - } - return true; -} - -static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id); -} - -static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){ - std::string str2 = str; - std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); - eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id); - return true; -} - -static const char * sample(struct common_sampler * smpl, - struct llama_context * ctx_llama, - int * n_past, int * st_pos_id) { - const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); - common_sampler_accept(smpl, id, true); - - const llama_model * model = llama_get_model(ctx_llama); - const llama_vocab * vocab = llama_model_get_vocab(model); - - static std::string ret; - if (llama_vocab_is_eog(vocab, id)) { - ret = ""; - } else { - ret = common_token_to_piece(ctx_llama, id); - } - eval_id(ctx_llama, id, n_past, st_pos_id); - return ret.c_str(); -} - -static const char* IMG_BASE64_TAG_BEGIN = ""; - -static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) { - begin_out = prompt.find(IMG_BASE64_TAG_BEGIN); - end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out); -} - -static bool prompt_contains_image(const std::string& prompt) { - size_t begin, end; - find_image_tag_in_prompt(prompt, begin, end); - return (begin != std::string::npos); -} - -// replaces the base64 image tag in the prompt with `replacement` -static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) { - size_t img_base64_str_start, img_base64_str_end; - find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); - if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { - LOG_ERR("%s: invalid base64 image tag. 
must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); - return NULL; - } - - auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN); - auto base64_bytes_count = img_base64_str_end - base64_bytes_start; - auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count ); - - auto required_bytes = base64::required_encode_size(base64_str.size()); - auto img_bytes = std::vector(required_bytes); - base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin()); - - auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); - if (!embed) { - LOG_ERR("%s: could not load image from base64 string.\n", __func__); - return NULL; - } - - return embed; -} - -static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") { - size_t begin, end; - find_image_tag_in_prompt(prompt, begin, end); - if (begin == std::string::npos || end == std::string::npos) { - return prompt; - } - auto pre = prompt.substr(0, begin); - auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END)); - return pre + replacement + post; -} - -struct llava_context { - struct clip_ctx * ctx_clip = NULL; - struct llama_context * ctx_llama = NULL; - struct llama_model * model = NULL; -}; - -static void print_usage(int, char ** argv) { - LOG("\n example usage:\n"); - LOG("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) { - - // load and preprocess the image - llava_image_embed * embed = NULL; - auto prompt = params->prompt; - if (prompt_contains_image(prompt)) { - if (!params->image.empty()) { - LOG_INF("using base64 encoded image instead of command line image path\n"); - } - embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); - if (!embed) { - LOG_ERR("%s: can't load image from prompt\n", __func__); - return NULL; - } - params->prompt = remove_image_from_prompt(prompt); - } else { - embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str()); - if (!embed) { - fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); - return NULL; - } - } - - return embed; -} - -static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) { - int n_past = 0; - int cur_pos_id = 0; - - const int max_tgt_len = params->n_predict < 0 ? 
256 : params->n_predict; - - std::string system_prompt, user_prompt; - size_t image_pos = prompt.find("<|vision_start|>"); - if (image_pos != std::string::npos) { - // new templating mode: Provide the full prompt including system message and use as a placeholder for the image - system_prompt = prompt.substr(0, image_pos); - user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length()); - LOG_INF("system_prompt: %s\n", system_prompt.c_str()); - if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - LOG_INF("user_prompt: %s\n", user_prompt.c_str()); - if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - } else { - // llava-1.5 native mode - system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>"; - user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n"; - if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - } - - eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true); - if (image_embed != nullptr) { - auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip); - qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size); - } - eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false); - - // generate the response - - LOG("\n"); - - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); - if (!smpl) { - LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); - exit(1); - } - - std::string response = ""; - for (int i = 0; i < max_tgt_len; i++) { - const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id); - response += tmp; - if (strcmp(tmp, "") == 0) break; - if (strstr(tmp, "###")) break; // Yi-VL behavior - LOG("%s", tmp); - if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) - if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 - if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 - - fflush(stdout); - } - - common_sampler_free(smpl); - LOG("\n"); -} - -static struct llama_model * llava_init(common_params * params) { - llama_backend_init(); - llama_numa_init(params->numa); - - llama_model_params model_params = common_model_params_to_llama(*params); - - llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); - if (model == NULL) { - LOG_ERR("%s: unable to load model\n" , __func__); - return NULL; - } - return model; -} - -static struct llava_context * llava_init_context(common_params * params, llama_model * model) { - const char * clip_path = params->mmproj.c_str(); - - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - - auto ctx_clip = clip_model_load(clip_path, 
/*verbosity=*/ 1); - - llama_context_params ctx_params = common_context_params_to_llama(*params); - ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings - - llama_context * ctx_llama = llama_init_from_model(model, ctx_params); - - if (ctx_llama == NULL) { - LOG_ERR("%s: failed to create the llama_context\n" , __func__); - return NULL; - } - - auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); - - ctx_llava->ctx_llama = ctx_llama; - ctx_llava->ctx_clip = ctx_clip; - ctx_llava->model = model; - return ctx_llava; -} - -static void llava_free(struct llava_context * ctx_llava) { - if (ctx_llava->ctx_clip) { - clip_free(ctx_llava->ctx_clip); - ctx_llava->ctx_clip = NULL; - } - - llama_free(ctx_llava->ctx_llama); - llama_model_free(ctx_llava->model); - llama_backend_free(); -} - -#ifndef NDEBUG - -static void debug_test_mrope_2d() { - // 1. Initialize backend - ggml_backend_t backend = NULL; - std::string backend_name = ""; -#ifdef GGML_USE_CUDA - fprintf(stderr, "%s: using CUDA backend\n", __func__); - backend = ggml_backend_cuda_init(0); // init device 0 - backend_name = "cuda"; - if (!backend) { - fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); - } -#endif - // if there aren't GPU Backends fallback to CPU backend - if (!backend) { - backend = ggml_backend_cpu_init(); - backend_name = "cpu"; - } - - // Calculate the size needed to allocate - size_t ctx_size = 0; - ctx_size += 2 * ggml_tensor_overhead(); // tensors - // no need to allocate anything else! - - // 2. Allocate `ggml_context` to store tensor data - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors() - }; - struct ggml_context * ctx = ggml_init(params); - - struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4); - ggml_set_name(pos, "pos"); - ggml_set_input(pos); - - std::vector dummy_q; - dummy_q.resize(128 * 12 * 30); - std::fill(dummy_q.begin(), dummy_q.end(), 0.1); - // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw)); - - std::vector pos_id; - pos_id.resize(30 * 4); - for (int i = 0; i < 30; i ++) { - pos_id[i] = i; - pos_id[i + 30] = i + 10; - pos_id[i + 60] = i + 20; - pos_id[i + 90] = i + 30; - } - int sections[4] = {32, 32, 0, 0}; - - // 4. Allocate a `ggml_backend_buffer` to store all tensors - ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - - // 5. Copy tensor data from main memory (RAM) to backend buffer - ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw)); - ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos)); - - // 6. 
Create a `ggml_cgraph` for mul_mat operation - struct ggml_cgraph * gf = NULL; - struct ggml_context * ctx_cgraph = NULL; - - // create a temporally context to build the graph - struct ggml_init_params params0 = { - /*.mem_size =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() - }; - ctx_cgraph = ggml_init(params0); - gf = ggml_new_graph(ctx_cgraph); - - struct ggml_tensor * result0 = ggml_rope_multi( - ctx_cgraph, inp_raw, pos, nullptr, - 128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1, - 0, 1, 32, 1); - - // Add "result" tensor and all of its dependencies to the cgraph - ggml_build_forward_expand(gf, result0); - - // 7. Create a `ggml_gallocr` for cgraph computation - ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); - ggml_gallocr_alloc_graph(allocr, gf); - - // 9. Run the computation - int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading - if (ggml_backend_is_cpu(backend)) { - ggml_backend_cpu_set_n_threads(backend, n_threads); - } - ggml_backend_graph_compute(backend, gf); - - // 10. Retrieve results (output tensors) - // in this example, output tensor is always the last tensor in the graph - struct ggml_tensor * result = result0; - // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1]; - float * result_data = (float *)malloc(ggml_nbytes(result)); - // because the tensor data is stored in device buffer, we need to copy it back to RAM - ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result)); - const std::string bin_file = "mrope_2d_" + backend_name +".bin"; - std::ofstream outFile(bin_file, std::ios::binary); - - if (outFile.is_open()) { - outFile.write(reinterpret_cast(result_data), ggml_nbytes(result)); - outFile.close(); - std::cout << "Data successfully written to " + bin_file << std::endl; - } else { - std::cerr << "Error opening file!" << std::endl; - } - - free(result_data); - // 11. Free memory and exit - ggml_free(ctx_cgraph); - ggml_gallocr_free(allocr); - ggml_free(ctx); - ggml_backend_buffer_free(buffer); - ggml_backend_free(backend); -} - -static void debug_dump_img_embed(struct llava_context * ctx_llava) { - int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama)); - int ne = n_embd * 4; - float vals[56 * 56 * 3]; - // float embd[ne]; - std::vector embd; - embd.resize(ne); - - for (int i = 0; i < 56*56; i++) - { - for (int c = 0; c < 3; c++) - vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56); - } - - clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data()); - - std::ofstream outFile("img_embed.bin", std::ios::binary); - if (outFile.is_open()) { - outFile.write(reinterpret_cast(embd.data()), ne * sizeof(float)); - - outFile.close(); - std::cout << "Data successfully written to mrope.bin" << std::endl; - } else { - std::cerr << "Error opening file!" 
<< std::endl; - } -} - -#endif - - -int main(int argc, char ** argv) { - ggml_time_init(); - - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { - return 1; - } - - common_init(); - - if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - print_usage(argc, argv); - return 1; - } - - auto * model = llava_init(&params); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to init llava model\n", __func__); - return 1; - } - - if (prompt_contains_image(params.prompt)) { - auto * ctx_llava = llava_init_context(&params, model); - - auto * image_embed = load_image(ctx_llava, &params, ""); - - // process the prompt - process_prompt(ctx_llava, image_embed, &params, params.prompt); - - llama_perf_context_print(ctx_llava->ctx_llama); - llava_image_embed_free(image_embed); - ctx_llava->model = NULL; - llava_free(ctx_llava); -#ifndef NDEBUG - } else if (params.image[0].empty()) { - auto ctx_llava = llava_init_context(&params, model); - - debug_test_mrope_2d(); - debug_dump_img_embed(ctx_llava); - - llama_perf_context_print(ctx_llava->ctx_llama); - ctx_llava->model = NULL; - llava_free(ctx_llava); -#endif - } else { - for (auto & image : params.image) { - auto * ctx_llava = llava_init_context(&params, model); - - auto * image_embed = load_image(ctx_llava, &params, image); - if (!image_embed) { - LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); - return 1; - } - - // process the prompt - process_prompt(ctx_llava, image_embed, &params, params.prompt); - - llama_perf_context_print(ctx_llava->ctx_llama); - llava_image_embed_free(image_embed); - ctx_llava->model = NULL; - llava_free(ctx_llava); - } - } - - llama_model_free(model); - - return 0; -} diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt index cbcbf26c9..fbfd0cdd7 100644 --- a/examples/llava/requirements.txt +++ b/examples/llava/requirements.txt @@ -2,4 +2,3 @@ --extra-index-url https://download.pytorch.org/whl/cpu pillow~=10.2.0 torch~=2.2.1 -torchvision~=0.17.1 diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt index 346861314..f0ae5cd89 100644 --- a/examples/lookahead/CMakeLists.txt +++ b/examples/lookahead/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-lookahead) add_executable(${TARGET} lookahead.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 2f0898e62..fb20ad93f 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -1,9 +1,7 @@ -#include "arg.h" #include "common.h" -#include "sampling.h" -#include "log.h" #include "llama.h" +#include #include #include #include @@ -37,51 +35,54 @@ struct ngram_container { }; int main(int argc, char ** argv) { - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); - const int W = 15; // lookahead window const int N = 5; // n-gram size const int G = 15; // max verification n-grams const bool dump_kv_cache = params.dump_kv_cache; +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("lookahead", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, 
argv); +#endif // LOG_DISABLE_LOGS + // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); + llama_model * model = NULL; + llama_context * ctx = NULL; + // load the target model - common_init_result llama_init = common_init_from_params(params); - - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); - - const llama_vocab * vocab = llama_model_get_vocab(model); + std::tie(model, ctx) = llama_init_from_gpt_params(params); // Tokenize the prompt std::vector inp; std::vector all; - inp = common_tokenize(ctx, params.prompt, true, true); + inp = ::llama_tokenize(ctx, params.prompt, true, true); all = inp; const int max_context_size = llama_n_ctx(ctx); const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - LOG("\n\n"); + fprintf(stderr, "\n\n"); for (auto id : inp) { - LOG("%s", common_token_to_piece(ctx, id).c_str()); + fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -91,8 +92,8 @@ int main(int argc, char ** argv) { const auto t_enc_start = ggml_time_us(); // eval the prompt - llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1)); - llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); + llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); + llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); for (int s = 1; s < W + G + 1; ++s) { llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); @@ -117,7 +118,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); // target model sampling context - struct common_sampler * smpl = common_sampler_init(model, params.sampling); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); // verification n-grams std::vector ngrams_cur(G); @@ -149,7 +150,7 @@ int main(int argc, char ** argv) { } // here we keep adding new n-grams as we go - ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G); + ngram_container ngrams_observed(llama_n_vocab(model), N, G); // debug struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1); @@ -158,14 +159,14 @@ int main(int argc, char ** argv) { // sample first token { - id = common_sampler_sample(smpl, ctx, 0); + id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0); - common_sampler_accept(smpl, id, true); + llama_sampling_accept(ctx_sampling, ctx, id, true); { - const std::string token_str = common_token_to_piece(ctx, id); + const std::string token_str = llama_token_to_piece(ctx, id); - LOG("%s", token_str.c_str()); + printf("%s", token_str.c_str()); fflush(stdout); } } @@ -174,7 +175,7 @@ int main(int argc, char ** argv) { // debug if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - common_kv_cache_dump_view_seqs(kvc_view, 40); + llama_kv_cache_dump_view_seqs(kvc_view, 40); } // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/ @@ -203,10 +204,10 @@ int main(int argc, char ** argv) { // V V V V V V // id { - common_batch_clear(batch); + llama_batch_clear(batch); // current token - first token of the first level - common_batch_add(batch, id, n_past, seq_id_all, true); + llama_batch_add(batch, id, n_past, seq_id_all, true); // verification n-grams - queue this before 
the lookahead tokens for less KV cache fragmentation { @@ -231,7 +232,7 @@ int main(int argc, char ** argv) { ngrams_cur[g].tokens [j + 1] = t; ngrams_cur[g].i_batch[j + 1] = batch.n_tokens; - common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true); + llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true); } } } @@ -243,19 +244,19 @@ int main(int argc, char ** argv) { seq_id_look[j] = i + j + 1; } - common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false); + llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false); } // fill the rest of the levels for (int j = 1; j < N - 1; j++) { for (int i = 0; i < W; i++) { - common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2); + llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2); } } } if (llama_decode(ctx, batch) != 0) { - LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__); + fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__); return 1; } @@ -283,23 +284,23 @@ int main(int argc, char ** argv) { } // sample the next token - id = common_sampler_sample(smpl, ctx, i_batch); + id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch); - common_sampler_accept(smpl, id, true); + llama_sampling_accept(ctx_sampling, ctx, id, true); // print { - const std::string token_str = common_token_to_piece(ctx, id); + const std::string token_str = llama_token_to_piece(ctx, id); if (v == 0) { - LOG("%s", token_str.c_str()); + printf("%s", token_str.c_str()); } else { // print light cyan - LOG("\033[0;96m%s\033[0m", token_str.c_str()); + printf("\033[0;96m%s\033[0m", token_str.c_str()); } fflush(stdout); - if (llama_vocab_is_eog(vocab, id)) { + if (llama_token_is_eog(model, id)) { has_eos = true; } @@ -329,21 +330,21 @@ int main(int argc, char ** argv) { // print known n-grams starting with token id (debug) if (0 && v == 0) { if (ngrams_observed.cnt[id] > 0) { - LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str()); + printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); } for (int i = 0; i < ngrams_observed.cnt[id]; i++) { - LOG(" - ngram %2d: ", i); + printf(" - ngram %2d: ", i); const int idx = id*(N - 1)*G + i*(N - 1); for (int j = 0; j < N - 1; j++) { - const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); + const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); - LOG("%s", token_str.c_str()); + printf("%s", token_str.c_str()); } - LOG("\n"); + printf("\n"); } } @@ -360,7 +361,7 @@ int main(int argc, char ** argv) { if (v == 0) { // sample from the last level for (int i = 0; i < W; i++) { - tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i); + tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i); } } else { for (int i = 0; i < W; i++) { @@ -454,31 +455,32 @@ int main(int argc, char ** argv) { auto t_dec_end = ggml_time_us(); - LOG("\n\n"); + LOG_TEE("\n\n"); - LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_TEE("encoded %4d tokens in %8.3f seconds, 
speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_INF("\n"); - LOG_INF("W = %2d\n", W); - LOG_INF("N = %2d\n", N); - LOG_INF("G = %2d\n", G); - LOG_INF("\n"); - LOG_INF("n_predict = %d\n", n_predict); - LOG_INF("n_accept = %d\n", n_accept); + LOG_TEE("\n"); + LOG_TEE("W = %2d\n", W); + LOG_TEE("N = %2d\n", N); + LOG_TEE("G = %2d\n", G); + LOG_TEE("\n"); + LOG_TEE("n_predict = %d\n", n_predict); + LOG_TEE("n_accept = %d\n", n_accept); - LOG_INF("\n"); - common_perf_print(ctx, smpl); - - common_sampler_free(smpl); + llama_print_timings(ctx); llama_kv_cache_view_free(&kvc_view); + llama_sampling_free(ctx_sampling); llama_batch_free(batch); + llama_free(ctx); + llama_free_model(model); + llama_backend_free(); - LOG("\n\n"); + fprintf(stderr, "\n\n"); return 0; } diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index fba78ceda..ef19fe25e 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -2,22 +2,22 @@ set(TARGET llama-lookup) add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TARGET llama-lookup-create) add_executable(${TARGET} lookup-create.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TARGET llama-lookup-merge) add_executable(${TARGET} lookup-merge.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TARGET llama-lookup-stats) add_executable(${TARGET} lookup-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 3da45ed9e..d713f6f21 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -1,15 +1,20 @@ -#include "arg.h" +#include "ggml.h" +#include "llama.h" #include "common.h" #include "ngram-cache.h" -#include "llama.h" +#include +#include +#include #include +#include #include int main(int argc, char ** argv){ - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } @@ -17,24 +22,22 @@ int main(int argc, char ** argv){ llama_backend_init(); llama_numa_init(params.numa); + llama_model * model = NULL; + llama_context * ctx = NULL; + // load the model - common_init_result llama_init = common_init_from_params(params); - - llama_model_ptr & model = llama_init.model; - llama_context_ptr & ctx = llama_init.context; - + std::tie(model, ctx) = llama_init_from_gpt_params(params); GGML_ASSERT(model != nullptr); // tokenize the prompt std::vector inp; - inp = 
common_tokenize(ctx.get(), params.prompt, true, true); + inp = ::llama_tokenize(ctx, params.prompt, true, true); fprintf(stderr, "%s: tokenization done\n", __func__); - common_ngram_cache ngram_cache; - common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true); + + llama_ngram_cache ngram_cache; + llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true); fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); - common_ngram_cache_save(ngram_cache, params.lookup_cache_static); - - return 0; + llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); } diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp index 6871c0f5f..81e2b0436 100644 --- a/examples/lookup/lookup-merge.cpp +++ b/examples/lookup/lookup-merge.cpp @@ -33,15 +33,15 @@ int main(int argc, char ** argv){ } fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str()); - common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]); + llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]); for (size_t i = 1; i < args.size()-1; ++i) { fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str()); - common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]); + llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]); - common_ngram_cache_merge(ngram_cache_merged, ngram_cache); + llama_ngram_cache_merge(ngram_cache_merged, ngram_cache); } fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str()); - common_ngram_cache_save(ngram_cache_merged, args.back()); + llama_ngram_cache_save(ngram_cache_merged, args.back()); } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index fcb289abe..2fe67100e 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -1,45 +1,44 @@ -#include "arg.h" +#include "ggml.h" #include "common.h" +#include "llama.h" #include "log.h" #include "ngram-cache.h" -#include "llama.h" -#include "ggml.h" +#include #include #include -#include #include #include #include +#include int main(int argc, char ** argv){ - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); - - const int n_draft = params.speculative.n_max; + const int n_draft = params.n_draft; // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); - // load the model - common_init_result llama_init = common_init_from_params(params); + llama_model * model = NULL; + llama_context * ctx = NULL; - llama_context_ptr & ctx = llama_init.context; + // load the model + std::tie(model, ctx) = llama_init_from_gpt_params(params); // tokenize the prompt std::vector inp; - inp = common_tokenize(ctx.get(), params.prompt, true, true); - - common_ngram_cache ngram_cache_context; - common_ngram_cache ngram_cache_dynamic; - common_ngram_cache ngram_cache_static; + inp = ::llama_tokenize(ctx, params.prompt, true, true); + llama_ngram_cache ngram_cache_context; + llama_ngram_cache ngram_cache_dynamic; + llama_ngram_cache ngram_cache_static; int64_t t_draft_flat_us = 0; int64_t t_draft_us = 0; @@ -48,16 +47,16 @@ int main(int argc, char ** argv){ if (!params.lookup_cache_static.empty()) { try { - ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static); + ngram_cache_static = 
llama_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { - LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); } } if (!params.lookup_cache_dynamic.empty()) { try { - ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic); + ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program } @@ -65,7 +64,7 @@ int main(int argc, char ** argv){ } const int n_input = inp.size(); - const int n_ctx = llama_n_ctx(ctx.get()); + const int n_ctx = llama_n_ctx(ctx); int n_drafted = 0; int n_accept = 0; @@ -86,7 +85,7 @@ int main(int argc, char ** argv){ { const int64_t t_start_draft_us = ggml_time_us(); - common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); + llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); t_draft_us += ggml_time_us() - t_start_draft_us; } @@ -105,7 +104,7 @@ int main(int argc, char ** argv){ { const int64_t t_start_draft_us = ggml_time_us(); - common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } } @@ -115,7 +114,7 @@ int main(int argc, char ** argv){ pseudo_output.push_back(inp_slice[pseudo_output.size()]); { const int64_t t_start_draft_us = ggml_time_us(); - common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } } @@ -129,29 +128,32 @@ int main(int argc, char ** argv){ const int64_t eta_min = eta_ms / (60*1000); const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; - LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); + LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); } // After each chunk, update the dynamic ngram cache with the context ngram cache: - common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); + llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); ngram_cache_context.clear(); } - LOG("\n"); + LOG_TEE("\n"); - LOG_INF("\n"); - LOG_INF("n_draft = %d\n", n_draft); - LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx); - LOG_INF("n_drafted = %d\n", n_drafted); - LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); - LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_TEE("\n"); + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx); + LOG_TEE("n_drafted = %d\n", n_drafted); + LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_INF("n_accept = %d\n", n_accept); - LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_TEE("n_accept = 
%d\n", n_accept); + LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + + llama_free(ctx); + llama_free_model(model); llama_backend_free(); - LOG("\n\n"); + fprintf(stderr, "\n\n"); return 0; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index dbd0444ec..bb571bac4 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -1,70 +1,72 @@ -#include "arg.h" #include "ggml.h" +#include "llama.h" #include "common.h" #include "ngram-cache.h" -#include "sampling.h" -#include "log.h" -#include "llama.h" +#include #include #include #include #include #include +#include int main(int argc, char ** argv){ - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); - // max. number of additional tokens to draft if match is found - const int n_draft = params.speculative.n_max; + const int n_draft = params.n_draft; const bool dump_kv_cache = params.dump_kv_cache; +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("lookup", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); + llama_model * model = NULL; + llama_context * ctx = NULL; + // load the model - common_init_result llama_init = common_init_from_params(params); - - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); - - const llama_vocab * vocab = llama_model_get_vocab(model); + std::tie(model, ctx) = llama_init_from_gpt_params(params); // tokenize the prompt std::vector inp; - inp = common_tokenize(ctx, params.prompt, true, true); + inp = ::llama_tokenize(ctx, params.prompt, true, true); - common_ngram_cache ngram_cache_context; - common_ngram_cache ngram_cache_dynamic; - common_ngram_cache ngram_cache_static; + llama_ngram_cache ngram_cache_context; + llama_ngram_cache ngram_cache_dynamic; + llama_ngram_cache ngram_cache_static; int64_t t_draft_flat_us = 0; int64_t t_draft_us = 0; { // Fill up context ngram cache with tokens from user input: const int64_t t_start_draft_us = ggml_time_us(); - common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false); if (!params.lookup_cache_static.empty()) { try { - ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static); + ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { - LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); } } if (!params.lookup_cache_dynamic.empty()) { try { - ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic); + ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program } @@ -75,14 +77,14 @@ int main(int argc, char ** argv){ const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + fprintf(stderr, "%s: 
error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - LOG("\n\n"); + fprintf(stderr, "\n\n"); for (auto id : inp) { - LOG("%s", common_token_to_piece(ctx, id).c_str()); + fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -91,8 +93,8 @@ int main(int argc, char ** argv){ const auto t_enc_start = ggml_time_us(); - llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1)); - llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); + llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); + llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); const auto t_enc_end = ggml_time_us(); @@ -104,7 +106,7 @@ int main(int argc, char ** argv){ bool has_eos = false; - struct common_sampler * smpl = common_sampler_init(model, params.sampling); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); std::vector draft; @@ -119,26 +121,26 @@ int main(int argc, char ** argv){ // debug if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - common_kv_cache_dump_view_seqs(kvc_view, 40); + llama_kv_cache_dump_view_seqs(kvc_view, 40); } // print current draft sequence - LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str()); + LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str()); int i_dft = 0; while (true) { // sample from the target model - llama_token id = common_sampler_sample(smpl, ctx, i_dft); + llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft); - common_sampler_accept(smpl, id, true); + llama_sampling_accept(ctx_sampling, ctx, id, true); - const std::string token_str = common_token_to_piece(ctx, id); + const std::string token_str = llama_token_to_piece(ctx, id); if (!params.use_color) { - LOG("%s", token_str.c_str()); + printf("%s", token_str.c_str()); } - if (llama_vocab_is_eog(vocab, id)) { + if (llama_token_is_eog(model, id)) { has_eos = true; } @@ -146,7 +148,7 @@ int main(int argc, char ** argv){ // check if the target token matches the draft if (i_dft < (int) draft.size() && id == draft[i_dft]) { - LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); + LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); ++n_accept; ++n_past; ++i_dft; @@ -154,25 +156,25 @@ int main(int argc, char ** argv){ { // Update context ngram cache with the newly accepted token: const int64_t t_start_draft_us = ggml_time_us(); - common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } if (params.use_color) { // color accepted draft token - LOG("\033[34m%s\033[0m", token_str.c_str()); + printf("\033[34m%s\033[0m", token_str.c_str()); fflush(stdout); } continue; } if (params.use_color) { - LOG("%s", token_str.c_str()); + printf("%s", token_str.c_str()); } fflush(stdout); - LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); + LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); draft.clear(); draft.push_back(id); @@ -180,7 +182,7 @@ int main(int argc, char ** argv){ { // Update context ngram cache with the newly accepted token: const int64_t t_start_draft_us = ggml_time_us(); - 
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } break; @@ -194,18 +196,18 @@ int main(int argc, char ** argv){ // clean the cache of draft tokens that weren't accepted llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - common_batch_clear(batch_tgt); - common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); + llama_batch_clear(batch_tgt); + llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); // Draft already contains a single token sampled from the model: GGML_ASSERT(draft.size() == 1); GGML_ASSERT(draft[0] == inp.back()); const int64_t t_start_draft_us = ggml_time_us(); - common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); + llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); for (size_t i = 1; i < draft.size(); ++i) { - common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); + llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); } t_draft_us += ggml_time_us() - t_start_draft_us; @@ -220,34 +222,36 @@ int main(int argc, char ** argv){ auto t_dec_end = ggml_time_us(); // Update dynamic ngram cache with context ngram cache and save it to disk: - common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); - common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); + llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); + llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); - LOG("\n\n"); + LOG_TEE("\n\n"); - LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_INF("\n"); - LOG_INF("n_draft = %d\n", n_draft); - LOG_INF("n_predict = %d\n", n_predict); - LOG_INF("n_drafted = %d\n", n_drafted); - LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); - LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_TEE("\n"); + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_predict = %d\n", n_predict); + LOG_TEE("n_drafted = %d\n", n_drafted); + LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_INF("n_accept = %d\n", n_accept); - LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_TEE("n_accept = %d\n", n_accept); + LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_INF("\ntarget:\n\n"); - common_perf_print(ctx, smpl); - - common_sampler_free(smpl); + LOG_TEE("\ntarget:\n"); + llama_print_timings(ctx); + llama_sampling_free(ctx_sampling); llama_batch_free(batch_tgt); + llama_free(ctx); + llama_free_model(model); + llama_backend_free(); - 
LOG("\n\n"); + fprintf(stderr, "\n\n"); return 0; } diff --git a/examples/simple-cmake-pkg/.gitignore b/examples/main-cmake-pkg/.gitignore similarity index 100% rename from examples/simple-cmake-pkg/.gitignore rename to examples/main-cmake-pkg/.gitignore diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt new file mode 100644 index 000000000..3b38db292 --- /dev/null +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -0,0 +1,32 @@ +cmake_minimum_required(VERSION 3.12) +project("llama-cli-cmake-pkg" C CXX) +set(TARGET llama-cli-cmake-pkg) + +find_package(Llama 0.0.1 REQUIRED) + +# Bake common functionality in with target. Because applications +# using the relocatable Llama package should be outside of the +# source tree, llama-cli-cmake-pkg pretends the dependencies are built-in. +set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common") +add_library(common OBJECT) +file(GLOB _common_files + "${_common_path}/*.h" + "${_common_path}/*.cpp" +) +target_sources(common PRIVATE ${_common_files}) + +# If the common project was part of "llama-cli-cmake-pkg" the transient +# defines would automatically be attached. Because the common func- +# tionality is separate, but dependent upon the defines, it must be +# explicitly extracted from the "llama" target. +# +get_target_property(_llama_transient_defines llama + INTERFACE_COMPILE_DEFINITIONS) + +target_compile_definitions(common PRIVATE "${_llama_transient_defines}") + +add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp) +target_include_directories(${TARGET} PRIVATE ${_common_path}) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md new file mode 100644 index 000000000..08d83dd08 --- /dev/null +++ b/examples/main-cmake-pkg/README.md @@ -0,0 +1,31 @@ +# llama.cpp/example/main-cmake-pkg + +This program builds [llama-cli](../main) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree. + +## Building + +Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions. + +### Considerations + +When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package. 
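As a rough sketch (paths are illustrative only), on a Linux setup where llama.cpp was installed to `$HOME/llama-install` and a CUDA toolkit lives in `/usr/local/cuda`, an out-of-tree project can help `find_package()` resolve both by extending `CMAKE_PREFIX_PATH`:

```bash
# Configure an out-of-tree consumer of the installed Llama package.
# CMAKE_PREFIX_PATH is a semicolon-separated list of install prefixes that
# find_package() searches: here the llama.cpp install and the CUDA toolkit.
cmake -S . -B build \
    -DCMAKE_PREFIX_PATH="$HOME/llama-install;/usr/local/cuda" \
    -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
```

The same idea applies on Windows by passing the install prefix used there (e.g. `C:/LlamaCPP`) in `-DCMAKE_PREFIX_PATH`, as in the steps below.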
+ +### Build llama.cpp and install to C:\LlamaCPP directory + +```cmd +git clone https://github.com/ggerganov/llama.cpp +cd llama.cpp +cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64 +cmake --build build --config Release +cmake --install build --prefix C:/LlamaCPP +``` + +### Build llama-cli-cmake-pkg + + +```cmd +cd ..\examples\main-cmake-pkg +cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64 +cmake --build build --config Release +cmake --install build --prefix C:/MyLlamaApp +``` diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index af3d9150f..5f6efaa9a 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-cli) add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main/README.md b/examples/main/README.md index ceaed42f6..9396a34fa 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -37,7 +37,7 @@ Once downloaded, place your model in the models folder in llama.cpp. ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): ```bash -./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 +./llama-cli -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 ``` ### Windows: @@ -66,10 +66,10 @@ In this section, we cover the most commonly used options for running the `llama- - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. -- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference. +- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. - `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\' - `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has. -- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- - `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. ## Input Prompts @@ -131,7 +131,7 @@ During text generation, LLaMA models have a limited context size, which means th ### Context Size -- `-c N, --ctx-size N`: Set the size of the prompt context (default: 4096, 0 = loaded from model). 
If a LLaMA model was built with a longer context, increasing this value will yield the best results on longer input/inference. +- `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). The LLaMA models were built with a context of 2048-8192, which will yield the best results on longer input/inference. ### Extended Context Size @@ -161,8 +161,6 @@ A value of -1 will enable infinite text generation, even though we have a finite If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled. -The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full. - It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter. ### Temperature @@ -177,34 +175,15 @@ Example usage: `--temp 0` - `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled). - `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). +- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty. The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1. The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`). -### DRY Repetition Penalty +Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases. -DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)). - -- `--dry-multiplier N`: Set the DRY sampling multiplier (default: 0.0, 0.0 = disabled). -- `--dry-base N`: Set the DRY sampling base value (default: 1.75). -- `--dry-allowed-length N`: Set the allowed length for DRY sampling (default: 2). -- `--dry-penalty-last-n N`: Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size). -- `--dry-sequence-breaker STRING`: Add a sequence breaker for DRY sampling. Can be used more than once to add multiple sequence breakers. Using this clears out the default breakers, which consist of: `['\n', ':', '"', '*']`. 
If the string `"none"` is supplied, no sequence breakers are used. - -The `dry-multiplier` option controls the strength of the DRY sampling effect. A value of 0.0 disables DRY sampling, while higher values increase its influence. A typical recommended value is 0.8. - -The `dry-base` option sets the base value for the exponential penalty calculation in DRY sampling. Higher values lead to more aggressive penalization of repetitions. - -The `dry-allowed-length` option sets the maximum length of repeated sequences that will not be penalized. Repetitions shorter than or equal to this length are not penalized, allowing for natural repetitions of short phrases or common words. - -The `dry-penalty-last-n` option controls how many recent tokens to consider when applying the DRY penalty. A value of -1 considers the entire context. Use a positive value to limit the consideration to a specific number of recent tokens. - -The `dry-sequence-breaker` option adds a single sequence breaker and can be used more than once to specify multiple sequence breakers. Sequence breakers interrupt sequence matching and break the input into parts where matching can be applied. - -DRY sampling provides more nuanced control over text generation, particularly for reducing long-range repetitions and maintaining global coherence. - -Example usage: `--dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2 --dry-penalty-last-n -1 --dry-sequence-breaker "—" --dry-sequence-breaker "##"` +Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl` ### Top-K Sampling @@ -230,6 +209,14 @@ The Min-P sampling method was designed as an alternative to Top-P, and aims to e Example usage: `--min-p 0.05` +### Tail-Free Sampling (TFS) + +- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). + +Tail-free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS. + +Example usage: `--tfs 0.95` + ### Locally Typical Sampling - `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled). @@ -252,19 +239,6 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` -### XTC Sampling - -- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0). -- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). - -Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. 
- -By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. - -Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`. - -Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1` - ### Logit Bias - `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion. @@ -308,11 +282,15 @@ These options help improve the performance and memory usage of the LLaMA models. These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root. +### Memory Float 32 + +- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended. + ### Batch Size -- `-ub N`, `--ubatch-size N`: Physical batch size. This is the maximum number of tokens that may be processed at a time. Increasing this value may improve performance during prompt processing, at the expense of higher memory usage. Default: `512`. +- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations. -- `-b N`, `--batch-size N`: Logical batch size. Increasing this value above the value of the physical batch size may improve prompt processing performance when using multiple GPUs with pipeline parallelism. Default: `2048`. +- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`. ### Prompt Caching @@ -328,22 +306,14 @@ These options help improve the performance and memory usage of the LLaMA models. For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize). -## LoRA (Low-Rank Adaptation) adapters - -- `--lora FNAME`: Optional path to a LoRA adapter to use with scaling of 1.0. Can be mixed with `--lora-scaled` and can be repeated to use multiple adapters. -- `--lora-scaled FNAME`: Optional path to a LoRA adapter with user-defined scaling. Can be mixed with `--lora` and can repeated to use multiple adapters. - -You can add LoRA adapters using `--lora` or `--lora-scaled`. 
For example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...` or `--lora-scaled lora_task_A.gguf 0.5 --lora-scaled lora_task_B.gguf 0.5`. - -LoRA adapters should be in GGUF format. To convert from Hugging Face format use the `convert-lora-to-gguf.py` script. LoRA adapters are loaded separately and applied during inference - they are not merged with the main model. This means that mmap model loading is fully supported when using LoRA adapters. The old `--lora-base` flag has been removed now that merging is no longer performed. - ## Additional Options These options provide extra functionality and customization when running the LLaMA models: - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. - `--verbose-prompt`: Print the prompt before generating text. -- `--no-display-prompt`: Don't print prompt at generation. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache. 
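Pulling several of the options above together, a typical invocation might look like the following sketch (model path and all values are illustrative only, assuming a two-GPU build with layer offloading available):

```bash
# Illustrative run combining the options discussed above:
#   -c 4096        prompt context of 4096 tokens
#   -ngl 99        offload (effectively) all layers to the GPUs
#   -ts 3,2 -mg 0  split large tensors 60/40 across GPU 0 and GPU 1, keep small tensors on GPU 0
#   sampling       moderate temperature with Top-K, Min-P and a light repeat penalty
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf \
    -p "I believe the meaning of life is" \
    -n 256 -c 4096 -ngl 99 -ts 3,2 -mg 0 \
    --temp 0.7 --top-k 40 --min-p 0.05 --repeat-penalty 1.1 --repeat-last-n 64
```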
diff --git a/examples/main/main.cpp b/examples/main/main.cpp index e654d3542..a0d817b1a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,11 +1,11 @@ -#include "arg.h" #include "common.h" -#include "console.h" -#include "log.h" -#include "sampling.h" -#include "llama.h" -#include "chat-template.hpp" +#include "console.h" +#include "llama.h" + +#include +#include +#include #include #include #include @@ -31,27 +31,15 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant"; - static llama_context ** g_ctx; static llama_model ** g_model; -static common_sampler ** g_smpl; -static common_params * g_params; +static gpt_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; -static void print_usage(int argc, char ** argv) { - (void) argc; - - LOG("\nexample usage:\n"); - LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); - LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); - LOG("\n"); -} - static bool file_exists(const std::string & path) { std::ifstream f(path.c_str()); return f.good(); @@ -64,6 +52,50 @@ static bool file_is_empty(const std::string & path) { return f.tellg() == 0; } +static void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const std::vector & input_tokens, const std::string & output, + const std::vector & output_tokens +) { + if (params.logdir.empty()) { + return; + } + + const std::string timestamp = string_get_sortable_timestamp(); + + const bool success = fs_create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Generation Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + yaml_dump_string_multiline(logfile, "output", output.c_str()); + yaml_dump_vector_int(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void sigint_handler(int signo) { if (signo == SIGINT) { @@ -72,29 +104,49 @@ static void sigint_handler(int signo) { need_insert_eot = true; } else { console::cleanup(); - LOG("\n"); - common_perf_print(*g_ctx, *g_smpl); - - // make sure all logs are flushed - LOG("Interrupted by user\n"); - common_log_pause(common_log_main()); - + printf("\n"); + llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); } } } #endif +static void llama_log_callback_logTee(ggml_log_level level, 
const char * text, void * user_data) { + (void) level; + (void) user_data; + LOG_TEE("%s", text); +} + +static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) { + llama_chat_msg new_msg{role, content}; + auto formatted = llama_chat_format_single( + model, g_params->chat_template, chat_msgs, new_msg, role == "user"); + chat_msgs.push_back({role, content}); + return formatted; +} + int main(int argc, char ** argv) { - common_params params; + gpt_params params; g_params = &params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { + + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); + llama_sampling_params & sparams = params.sparams; - auto & sparams = params.sampling; +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("main", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); + llama_log_set(llama_log_callback_logTee, nullptr); +#endif // LOG_DISABLE_LOGS + + // TODO: Dump params ? + //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); // save choice to use color for later // (note for later: this is a slightly awkward choice) @@ -102,207 +154,171 @@ int main(int argc, char ** argv) { atexit([]() { console::cleanup(); }); if (params.logits_all) { - LOG_ERR("************\n"); - LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - LOG_ERR("************\n\n"); + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); return 0; } if (params.embedding) { - LOG_ERR("************\n"); - LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - LOG_ERR("************\n\n"); + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } if (params.rope_freq_base != 0.0) { - LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_INF("%s: llama backend init\n", __func__); + LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + LOG_TEE("%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + + LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = nullptr; - llama_context * ctx = nullptr; - common_sampler * smpl = nullptr; - + llama_model * model; + llama_context * ctx; + llama_context * ctx_guidance = NULL; + std::vector<llama_chat_msg> chat_msgs; g_model = &model; g_ctx = &ctx; - g_smpl = &smpl; - - std::vector<common_chat_msg> chat_msgs; // load the model and apply lora adapter, if any - 
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - common_init_result llama_init = common_init_from_params(params); - - model = llama_init.model.get(); - ctx = llama_init.context.get(); + LOG("%s: load the model and apply lora adapter, if any\n", __func__); + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (sparams.cfg_scale > 1.f) { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + ctx_guidance = llama_new_context_with_model(model, lparams); + } if (model == NULL) { - LOG_ERR("%s: error: unable to load model\n", __func__); + LOG_TEE("%s: error: unable to load model\n", __func__); return 1; } - const llama_vocab * vocab = llama_model_get_vocab(model); - auto chat_templates = common_chat_templates_from_model(model, params.chat_template); - - LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); - - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); - auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); - auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); - - struct ggml_threadpool_params tpp_batch = - ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); - struct ggml_threadpool_params tpp = - ggml_threadpool_params_from_cpu_params(params.cpuparams); - - set_process_priority(params.cpuparams.priority); - - struct ggml_threadpool * threadpool_batch = NULL; - if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { - threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); - if (!threadpool_batch) { - LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); - return 1; - } - - // Start the non-batch threadpool in the paused state - tpp.paused = true; - } - - struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); - if (!threadpool) { - LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - return 1; - } - - llama_attach_threadpool(ctx, threadpool, threadpool_batch); - - const int n_ctx_train = llama_model_n_ctx_train(model); + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); + LOG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); - } - - // auto enable conversation mode if chat template is available - const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default; - if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { - if (has_chat_template) { - LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); - params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; - } else { - params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; - } - } - - // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning - if (params.conversation_mode && !has_chat_template) { - LOG_WRN("%s: chat template is not available or is not supported. 
This may cause the model to output suboptimal responses\n", __func__); + LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, n_ctx); } // print chat template example in conversation mode - if (params.conversation_mode) { + if (params.conversation) { if (params.enable_chat_template) { - LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str()); + LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); } else { - LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } } // print system information { - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); + LOG_TEE("\n"); + LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); } std::string path_session = params.path_prompt_cache; std::vector session_tokens; if (!path_session.empty()) { - LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); if (!file_exists(path_session)) { - LOG_INF("%s: session file does not exist, will create.\n", __func__); + LOG_TEE("%s: session file does not exist, will create.\n", __func__); } else if (file_is_empty(path_session)) { - LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__); + LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__); } else { // The file exists and is not empty session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); + LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); return 1; } session_tokens.resize(n_token_count_out); - LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); } } - const bool add_bos = llama_vocab_get_add_bos(vocab) && !params.use_jinja; + const bool add_bos = llama_should_add_bos_token(model); if (!llama_model_has_encoder(model)) { - GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + GGML_ASSERT(llama_add_eos_token(model) != 1); } - - LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); + LOG("add_bos: %d\n", add_bos); std::vector embd_inp; - auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) { - common_chat_msg new_msg{role, content, {}}; - auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja); - chat_msgs.push_back({role, content, {}}); - LOG_DBG("formatted: '%s'\n", formatted.c_str()); - return formatted; - }; - { - auto prompt = (params.conversation_mode && params.enable_chat_template) - // format the system prompt in conversation mode (fallback to default if empty) - ? chat_add_and_format("system", params.prompt.empty() ? 
DEFAULT_SYSTEM_MESSAGE : params.prompt) - // otherwise use the prompt as is + auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty()) + ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { - LOG_DBG("tokenize the prompt\n"); - embd_inp = common_tokenize(ctx, prompt, true, true); + LOG("tokenize the prompt\n"); + embd_inp = ::llama_tokenize(ctx, prompt, true, true); } else { - LOG_DBG("use session tokens\n"); + LOG("use session tokens\n"); embd_inp = session_tokens; } - LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); - LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); + LOG("prompt: \"%s\"\n", log_tostr(prompt)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } // Should not run without any tokens if (embd_inp.empty()) { if (add_bos) { - embd_inp.push_back(llama_vocab_bos(vocab)); - LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); + embd_inp.push_back(llama_token_bos(model)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } else { - LOG_ERR("input is empty\n"); + LOG_TEE("error: input is empty\n"); return -1; } } // Tokenize negative prompt + std::vector guidance_inp; + int guidance_offset = 0; + int original_prompt_len = 0; + if (ctx_guidance) { + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); + + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); + + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true, true); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); + + original_prompt_len = original_inp.size(); + guidance_offset = (int)guidance_inp.size() - original_prompt_len; + LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); + LOG("guidance_offset: %s", log_tostr(guidance_offset)); + } + if ((int) embd_inp.size() > n_ctx - 4) { - LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } @@ -316,28 +332,29 @@ int main(int argc, char ** argv) { n_matching_session_tokens++; } if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - LOG_INF("%s: using full prompt from session file\n", __func__); + LOG_TEE("%s: using full prompt from session file\n", __func__); } else if (n_matching_session_tokens >= embd_inp.size()) { - LOG_INF("%s: session file has exact match for prompt!\n", __func__); + LOG_TEE("%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); } else { - LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_TEE("%s: session file matches %zu / %zu tokens 
of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); } // remove any "future" tokens that we might have inherited from the previous session llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } - LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", - embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); + LOGLN( + "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu", + log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); // if we will use the cache for the full prompt without reaching the end of the cache, force // reevaluation of the last token to recalculate the cached logits if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { - LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); + LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1); } @@ -349,7 +366,7 @@ int main(int argc, char ** argv) { params.n_keep += add_bos; // always keep the BOS token } - if (params.conversation_mode) { + if (params.conversation) { params.interactive_first = true; } @@ -359,20 +376,30 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_TEE("\n"); + LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (ctx_guidance) { + LOG_TEE("\n"); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); + LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int) guidance_inp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + } } if (params.n_keep > add_bos) { - LOG_INF("%s: static prompt based on n_keep: '", __func__); + LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_CNT("'\n"); + LOG_TEE("'\n"); } - LOG_INF("\n"); + LOG_TEE("\n"); } // ctrl+C handling @@ -392,56 +419,47 @@ int main(int argc, char ** argv) { } if (params.interactive) { - LOG_INF("%s: interactive mode on.\n", __func__); + LOG_TEE("%s: interactive mode on.\n", __func__); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { - LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str()); + LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); if (params.verbose_prompt) { - auto tmp = common_tokenize(ctx, antiprompt, false, true); + auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], 
common_token_to_piece(ctx, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } } if (params.input_prefix_bos) { - LOG_INF("Input prefix with BOS\n"); + LOG_TEE("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); if (params.verbose_prompt) { - auto tmp = common_tokenize(ctx, params.input_prefix, true, true); + auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } if (!params.input_suffix.empty()) { - LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); if (params.verbose_prompt) { - auto tmp = common_tokenize(ctx, params.input_suffix, false, true); + auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } } - - smpl = common_sampler_init(model, sparams); - if (!smpl) { - LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); - return 1; - } - - LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); - LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); - LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); - - LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); + LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) @@ -455,9 +473,9 @@ int main(int argc, char ** argv) { GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT - LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); } - LOG_INF("\n"); + LOG_TEE("\n\n"); if (params.interactive) { const char * control_message; @@ -469,15 +487,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_INF("== Running in interactive mode. ==\n"); + LOG_TEE("== Running in interactive mode. ==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_INF( " - Press Ctrl+C to interject at any time.\n"); + LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_INF( "%s", control_message); - if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) { - LOG_INF( " - Using default system message. 
To change it, set a different value via -p PROMPT or -f FILE argument.\n"); - } - LOG_INF("\n"); + LOG_TEE( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -491,6 +505,7 @@ int main(int argc, char ** argv) { int n_remain = params.n_predict; int n_consumed = 0; int n_session_consumed = 0; + int n_past_guidance = 0; std::vector input_tokens; g_input_tokens = &input_tokens; std::vector output_tokens; g_output_tokens = &output_tokens; @@ -502,29 +517,34 @@ int main(int argc, char ** argv) { display = params.display_prompt; std::vector embd; + std::vector embd_guidance; - // single-token antiprompts - std::vector antiprompt_token; + // tokenized antiprompts + std::vector> antiprompt_ids; + antiprompt_ids.reserve(params.antiprompt.size()); for (const std::string & antiprompt : params.antiprompt) { - auto ids = ::common_tokenize(ctx, antiprompt, false, true); - if (ids.size() == 1) { - antiprompt_token.push_back(ids[0]); - } + antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); + } + + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); + if (!ctx_sampling) { + fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); + exit(1); } if (llama_model_has_encoder(model)) { int enc_input_size = embd_inp.size(); llama_token * enc_input_buf = embd_inp.data(); - if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) { - LOG_ERR("%s : failed to eval\n", __func__); + if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { + LOG_TEE("%s : failed to eval\n", __func__); return 1; } llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == LLAMA_TOKEN_NULL) { - decoder_start_token_id = llama_vocab_bos(vocab); + if (decoder_start_token_id == -1) { + decoder_start_token_id = llama_token_bos(model); } embd_inp.clear(); @@ -544,8 +564,9 @@ int main(int argc, char ** argv) { embd.resize(max_embd_size); console::set_display(console::error); - LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + printf("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); console::set_display(console::reset); + fflush(stdout); } if (ga_n == 1) { @@ -553,22 +574,16 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - - if (n_past + (int) embd.size() >= n_ctx) { - if (!params.ctx_shift){ - LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); - break; - } - + if (n_past + (int) embd.size() + std::max(0, guidance_offset) >= n_ctx) { if (params.n_predict == -2) { - LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep; const int n_discard = n_left/2; - LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); @@ -576,11 +591,15 @@ int main(int argc, char ** argv) { n_past -= n_discard; - LOG_DBG("after swap: n_past = %d\n", n_past); + if (ctx_guidance) { + n_past_guidance -= n_discard; + } - LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); - LOG_DBG("clear session path\n"); + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + + LOG("clear session path\n"); path_session.clear(); } } else { @@ -590,10 +609,10 @@ int main(int argc, char ** argv) { const int bd = (ga_w/ga_n)*(ga_n - 1); const int dd = (ga_w/ga_n) - ib*bd - ga_w; - LOG_DBG("\n"); - LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); - LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); - LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); + LOG("\n"); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); + LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); @@ -603,7 +622,7 @@ int main(int argc, char ** argv) { ga_i += ga_w/ga_n; - LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); } } @@ -629,25 +648,65 @@ int main(int argc, char ** argv) { } } + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + if (ctx_guidance) { + int input_size = 0; + llama_token * input_buf = NULL; + + if (n_past_guidance < (int) guidance_inp.size()) { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + 
original_prompt_len < embd.end()) { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() + ); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); + } else { + input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { int n_eval = (int) embd.size() - i; if (n_eval > params.n_batch) { n_eval = params.n_batch; } - LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { - LOG_ERR("%s : failed to eval\n", __func__); + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { + LOG_TEE("%s : failed to eval\n", __func__); return 1; } n_past += n_eval; - LOG_DBG("n_past = %d\n", n_past); + LOG("n_past = %d\n", n_past); // Display total tokens alongside total time if (params.n_print > 0 && n_past % params.n_print == 0) { - LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } } @@ -658,6 +717,7 @@ int main(int argc, char ** argv) { } embd.clear(); + embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // optionally save the session on first sample (for faster prompt loading next time) @@ -665,14 +725,14 @@ int main(int argc, char ** argv) { need_to_save_session = false; llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - LOG_DBG("saved session to %s\n", path_session.c_str()); + LOG("saved session to %s\n", path_session.c_str()); } - const llama_token id = common_sampler_sample(smpl, ctx, -1); + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - common_sampler_accept(smpl, id, /* accept_grammar= */ true); + llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true); - // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); embd.push_back(id); @@ -682,16 +742,16 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; - LOG_DBG("n_remain: %d\n", n_remain); + LOG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -703,10 +763,10 @@ int main(int argc, char ** argv) { // display text if 
(input_echo && display) { for (auto id : embd) { - const std::string token_str = common_token_to_piece(ctx, id, params.special); + const std::string token_str = llama_token_to_piece(ctx, id, params.special); // Console/Stream Output - LOG("%s", token_str.c_str()); + fprintf(stdout, "%s", token_str.c_str()); // Record Displayed Tokens To Log // Note: Generated tokens are created one by one hence this check @@ -718,6 +778,8 @@ int main(int argc, char ** argv) { output_tokens.push_back(id); output_ss << token_str; } + + fflush(stdout); } } @@ -732,7 +794,7 @@ int main(int argc, char ** argv) { // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { const int n_prev = 32; - const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); + const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. @@ -754,61 +816,64 @@ int main(int argc, char ** argv) { } // check for reverse prompt using special tokens - llama_token last_token = common_sampler_last(smpl); - if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) { - if (params.interactive) { - is_interacting = true; + llama_token last_token = llama_sampling_last(ctx_sampling); + for (std::vector ids : antiprompt_ids) { + if (ids.size() == 1 && last_token == ids[0]) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; } - is_antiprompt = true; } if (is_antiprompt) { - LOG_DBG("found antiprompt: %s\n", last_output.c_str()); + LOG("found antiprompt: %s\n", last_output.c_str()); } } // deal with end of generation tokens in interactive mode - if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { - LOG_DBG("found an EOG token\n"); + if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { + LOG("found an EOG token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt - const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true); + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); is_antiprompt = true; } if (params.enable_chat_template) { - chat_add_and_format("assistant", assistant_ss.str()); + chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); } is_interacting = true; - LOG("\n"); + printf("\n"); } } // if current token is not EOG, we add it to current assistant message - if (params.conversation_mode) { - const auto id = common_sampler_last(smpl); - assistant_ss << common_token_to_piece(ctx, id, false); + if (params.conversation) { + auto id = llama_sampling_last(ctx_sampling); + assistant_ss << llama_token_to_piece(ctx, id, false); } if (n_past > 0 && is_interacting) { - LOG_DBG("waiting for user input\n"); + LOG("waiting for user input\n"); - if (params.conversation_mode) { - LOG("\n> "); + if (params.conversation) { + printf("\n> "); } if (params.input_prefix_bos) { - LOG_DBG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_vocab_bos(vocab)); + LOG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_token_bos(model)); } std::string buffer; - if (!params.input_prefix.empty() && !params.conversation_mode) { - LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - LOG("%s", params.input_prefix.c_str()); + if 
(!params.input_prefix.empty() && !params.conversation) { + LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + printf("%s", params.input_prefix.c_str()); } // color user input only @@ -830,12 +895,12 @@ int main(int argc, char ** argv) { // Entering a empty line lets the user pass control back if (buffer.length() > 1) { // append input suffix if any - if (!params.input_suffix.empty() && !params.conversation_mode) { - LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - LOG("%s", params.input_suffix.c_str()); + if (!params.input_suffix.empty() && !params.conversation) { + LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + printf("%s", params.input_suffix.c_str()); } - LOG_DBG("buffer: '%s'\n", buffer.c_str()); + LOG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); @@ -843,21 +908,21 @@ int main(int argc, char ** argv) { string_process_escapes(buffer); } - bool format_chat = params.conversation_mode && params.enable_chat_template; + bool format_chat = params.conversation && params.enable_chat_template; std::string user_inp = format_chat - ? chat_add_and_format("user", std::move(buffer)) + ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) : std::move(buffer); // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) - const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat); - const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true); + const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); + const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); - LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); // if user stop generation mid-way, we must add EOT to finish model's last response if (need_insert_eot && format_chat) { - llama_token eot = llama_vocab_eot(vocab); - embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot); + llama_token eot = llama_token_eot(model); + embd_inp.push_back(eot == -1 ? 
llama_token_eos(model) : eot); need_insert_eot = false; } @@ -868,16 +933,16 @@ int main(int argc, char ** argv) { for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); - output_ss << common_token_to_piece(ctx, token); + output_ss << llama_token_to_piece(ctx, token); } // reset assistant message assistant_ss.str(""); n_remain -= line_inp.size(); - LOG_DBG("n_remain: %d\n", n_remain); + LOG("n_remain: %d\n", n_remain); } else { - LOG_DBG("empty line, passing control back\n"); + LOG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -885,15 +950,15 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - common_sampler_reset(smpl); + llama_sampling_reset(ctx_sampling); } is_interacting = false; } } // end of generation - if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) { - LOG(" [end of text]\n"); + if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { + LOG_TEE(" [end of text]\n"); break; } @@ -906,19 +971,23 @@ int main(int argc, char ** argv) { } if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { - LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } - LOG("\n\n"); - common_perf_print(ctx, smpl); + llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - common_sampler_free(smpl); + if (ctx_guidance) { llama_free(ctx_guidance); } + llama_free(ctx); + llama_free_model(model); + llama_sampling_free(ctx_sampling); llama_backend_free(); - ggml_threadpool_free_fn(threadpool); - ggml_threadpool_free_fn(threadpool_batch); +#ifndef LOG_DISABLE_LOGS + LOG_TEE("Log end\n"); +#endif // LOG_DISABLE_LOGS return 0; } diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt index 847e916de..c13557bac 100644 --- a/examples/parallel/CMakeLists.txt +++ b/examples/parallel/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-parallel) add_executable(${TARGET} parallel.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7ef43d5e1..7faeaec97 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -1,10 +1,7 @@ // A basic application simulating a server with multiple clients. // The clients submit requests to the server and they are processed in parallel. 
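The main.cpp hunks above end by restoring the older init/teardown API (llama_init_from_gpt_params, llama_sampling_init / llama_sampling_free, llama_print_timings, llama_free_model). A minimal sketch of that lifecycle with the generation loop elided; run() is a hypothetical wrapper and error handling is reduced to a bare return:

#include "common.h"   // gpt_params, llama_init_from_gpt_params, sampling helpers
#include "llama.h"

#include <tuple>

static int run(gpt_params & params) {
    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model   * model = nullptr;
    llama_context * ctx   = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);   // load model + create context
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

    // ... tokenize the prompt, then the llama_decode / llama_sampling_sample loop goes here ...

    llama_print_timings(ctx);    // per-context timing report (takes over from common_perf_print)

    llama_free(ctx);
    llama_free_model(model);
    llama_sampling_free(ctx_sampling);
    llama_backend_free();
    return 0;
}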
-#include "arg.h" #include "common.h" -#include "sampling.h" -#include "log.h" #include "llama.h" #include @@ -53,8 +50,8 @@ static std::vector k_prompts = { struct client { ~client() { - if (smpl) { - common_sampler_free(smpl); + if (ctx_sampling) { + llama_sampling_free(ctx_sampling); } } @@ -75,7 +72,7 @@ struct client { std::string prompt; std::string response; - struct common_sampler * smpl = nullptr; + struct llama_sampling_context * ctx_sampling = nullptr; }; static void print_date_time() { @@ -84,9 +81,7 @@ static void print_date_time() { char buffer[80]; strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time); - LOG_INF("\n"); - LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer); - LOG_INF("\n"); + printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer); } // Define a split string function to ... @@ -103,14 +98,13 @@ static std::vector split_string(const std::string& input, char deli int main(int argc, char ** argv) { srand(1234); - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); - // number of simultaneous "clients" to simulate const int32_t n_clients = params.n_parallel; @@ -125,36 +119,41 @@ int main(int argc, char ** argv) { const bool dump_kv_cache = params.dump_kv_cache; +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("parallel", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); + llama_model * model = NULL; + llama_context * ctx = NULL; + // load the target model - common_init_result llama_init = common_init_from_params(params); - - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); - - const llama_vocab * vocab = llama_model_get_vocab(model); + std::tie(model, ctx) = llama_init_from_gpt_params(params); // load the prompts from an external file if there are any if (params.prompt.empty()) { - LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); + printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); } else { // Output each line of the input params.prompts vector and copy to k_prompts int index = 0; - LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); + printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); std::vector prompts = split_string(params.prompt, '\n'); for (const auto& prompt : prompts) { k_prompts.resize(index + 1); k_prompts[index] = prompt; index++; - LOG_INF("%3d prompt: %s\n", index, prompt.c_str()); + printf("%3d prompt: %s\n", index, prompt.c_str()); } } - LOG_INF("\n\n"); + fprintf(stderr, "\n\n"); + fflush(stderr); const int n_ctx = llama_n_ctx(ctx); @@ -162,11 +161,11 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.smpl = common_sampler_init(model, params.sampling); + client.ctx_sampling = llama_sampling_init(params.sparams); } std::vector tokens_system; - tokens_system = common_tokenize(ctx, k_system, true); + tokens_system = ::llama_tokenize(ctx, k_system, true); const int32_t n_tokens_system = tokens_system.size(); llama_seq_id g_seq_id = 0; @@ -183,19 +182,19 @@ int main(int argc, char ** argv) { const auto t_main_start = 
ggml_time_us(); - LOG_INF("%s: Simulating parallel requests from clients:\n", __func__); - LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); - LOG_INF("\n"); + LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); + LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); + LOG_TEE("\n"); { - LOG_INF("%s: Evaluating the system prompt ...\n", __func__); + LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); for (int32_t i = 0; i < n_tokens_system; ++i) { - common_batch_add(batch, tokens_system[i], i, { 0 }, false); + llama_batch_add(batch, tokens_system[i], i, { 0 }, false); } if (llama_decode(ctx, batch) != 0) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_TEE("%s: llama_decode() failed\n", __func__); return 1; } @@ -204,18 +203,18 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } - LOG_INF("\n"); + LOG_TEE("\n"); } - LOG_INF("Processing requests ...\n\n"); + LOG_TEE("Processing requests ...\n\n"); while (true) { if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - common_kv_cache_dump_view_seqs(kvc_view, 40); + llama_kv_cache_dump_view_seqs(kvc_view, 40); } - common_batch_clear(batch); + llama_batch_clear(batch); // decode any currently ongoing sequences for (auto & client : clients) { @@ -225,7 +224,7 @@ int main(int argc, char ** argv) { client.i_batch = batch.n_tokens; - common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); + llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); client.n_decoded += 1; } @@ -238,7 +237,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } - LOG_INF("%s: clearing the KV cache\n", __func__); + LOG_TEE("%s: clearing the KV cache\n", __func__); } // insert new sequences for decoding @@ -254,14 +253,14 @@ int main(int argc, char ** argv) { client.prompt = client.input + "\nAssistant:"; client.response = ""; - common_sampler_reset(client.smpl); + llama_sampling_reset(client.ctx_sampling); // do not prepend BOS because we have a system prompt! 
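parallel.cpp evaluates the shared system prompt once into sequence 0, mirrors that KV-cache range into every client sequence, and only then feeds each client's own prompt at positions offset by n_tokens_system. A minimal sketch of the priming step, assuming the batch was created with llama_batch_init and that tokens_system / n_clients are prepared by the caller:

#include "common.h"   // llama_batch_clear, llama_batch_add
#include "llama.h"

#include <vector>

static bool prime_system_prompt(llama_context * ctx, llama_batch & batch,
                                const std::vector<llama_token> & tokens_system, int32_t n_clients) {
    llama_batch_clear(batch);
    for (int32_t i = 0; i < (int32_t) tokens_system.size(); ++i) {
        // position i, sequence 0, no logits needed for prompt tokens
        llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
    }
    if (llama_decode(ctx, batch) != 0) {
        return false;
    }
    // copy the whole seq-0 KV range into each client's sequence (client i decodes as seq id i + 1)
    for (int32_t i = 1; i <= n_clients; ++i) {
        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
    }
    return true;
}

Because the copy happens inside the KV cache, the system-prompt decode cost is paid only once no matter how many clients are simulated.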
std::vector tokens_prompt; - tokens_prompt = common_tokenize(ctx, client.prompt, false); + tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); for (size_t i = 0; i < tokens_prompt.size(); ++i) { - common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); + llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); } // extract the logits only for the last token @@ -273,7 +272,7 @@ int main(int argc, char ** argv) { client.n_decoded = 0; client.i_batch = batch.n_tokens - 1; - LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); + LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); g_seq_id += 1; @@ -310,17 +309,18 @@ int main(int argc, char ** argv) { batch.n_seq_id + i, batch.seq_id + i, batch.logits + i, + 0, 0, 0, // unused }; const int ret = llama_decode(ctx, batch_view); if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); return 1; } - LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); n_cache_miss += 1; @@ -331,7 +331,7 @@ int main(int argc, char ** argv) { continue; } - LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens); + LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens); for (auto & client : clients) { if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { @@ -341,9 +341,9 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d\n", // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i); + const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i); - common_sampler_accept(client.smpl, id, true); + llama_sampling_accept(client.ctx_sampling, ctx, id, true); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients @@ -351,7 +351,7 @@ int main(int argc, char ** argv) { client.t_start_gen = ggml_time_us(); } - const std::string token_str = common_token_to_piece(ctx, id); + const std::string token_str = llama_token_to_piece(ctx, id); client.response += token_str; client.sampled = id; @@ -360,7 +360,7 @@ int main(int argc, char ** argv) { // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); if (client.n_decoded > 2 && - (llama_vocab_is_eog(vocab, id) || + (llama_token_is_eog(model, id) || (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) || client.response.find("User:") != std::string::npos || client.response.find('\n') != std::string::npos)) { @@ -371,12 +371,12 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); + llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); - LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n", + LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n", client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded, (t_main_end - client.t_start_prompt) / 1e6, (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6, @@ -399,28 +399,30 @@ int main(int argc, char ** argv) { print_date_time(); - LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); + LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); if (params.prompt_file.empty()) { params.prompt_file = "used built-in defaults"; } - LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); - LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); + LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); + LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); - LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); - LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); - LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); - LOG_INF("Cache misses: %6d\n", n_cache_miss); + LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); + LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); + LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); + LOG_TEE("Cache misses: %6d\n", n_cache_miss); - LOG_INF("\n"); + LOG_TEE("\n"); - // TODO: print sampling/grammar timings for all clients - llama_perf_context_print(ctx); + llama_print_timings(ctx); llama_batch_free(batch); + llama_free(ctx); + llama_free_model(model); + llama_backend_free(); - LOG("\n\n"); + fprintf(stderr, "\n\n"); return 0; } diff --git a/examples/passkey/CMakeLists.txt b/examples/passkey/CMakeLists.txt index 9bc5110c2..dc467a5d3 100644 --- a/examples/passkey/CMakeLists.txt +++ b/examples/passkey/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-passkey) add_executable(${TARGET} passkey.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5953928d4..d03215cd1 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -1,6 +1,4 @@ -#include "arg.h" #include "common.h" -#include "log.h" #include 
"llama.h" #include @@ -8,24 +6,27 @@ #include #include -static void print_usage(int, char ** argv) { - LOG("\nexample usage:\n"); - LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); - LOG("\n"); +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); + LOG_TEE("\n"); } int main(int argc, char ** argv) { - common_params params; + gpt_params params; params.n_junk = 250; params.n_keep = 32; params.i_pos = -1; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } - common_init(); + srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed); int n_junk = params.n_junk; int n_keep = params.n_keep; @@ -61,43 +62,36 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = common_model_params_to_llama(params); + llama_model_params model_params = llama_model_params_from_gpt_params(params); - llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); + llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { - LOG_ERR("%s: unable to load model\n" , __func__); + fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } - const llama_vocab * vocab = llama_model_get_vocab(model); - // initialize the context - llama_context_params ctx_params = common_context_params_to_llama(params); + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); - ctx_params.n_ctx = llama_model_n_ctx_train(model)*n_grp + n_keep; + ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); - llama_context * ctx = llama_init_from_model(model, ctx_params); + llama_context * ctx = llama_new_context_with_model(model, ctx_params); + if (ctx == NULL) { - LOG_ERR("%s: failed to create the llama_context\n" , __func__); + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); return 1; } - auto sparams = llama_sampler_chain_default_params(); - - llama_sampler * smpl = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); - // tokenize the prompt std::vector tokens_list; - tokens_list = common_tokenize(ctx, params.prompt, true); + tokens_list = ::llama_tokenize(ctx, params.prompt, true); // tokenize the prefix and use it as a sink - const int n_tokens_prefix = common_tokenize(ctx, prompt_prefix, true).size(); + const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size(); const int n_tokens_all = tokens_list.size(); @@ -112,14 +106,14 @@ int main(int argc, char ** argv) { const int n_batch = ctx_params.n_batch; const int n_batch_grp = ctx_params.n_batch/n_grp; - LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); // print the prompt token-by-token - LOG_INF("\n"); - LOG_INF("prefix tokens: %d\n", n_tokens_prefix); - LOG_INF("prompt 
tokens: %d\n", n_tokens_all); - //LOG_INF("prompt: %s\n", params.prompt.c_str()); + LOG_TEE("\n"); + LOG_TEE("prefix tokens: %d\n", n_tokens_prefix); + LOG_TEE("prompt tokens: %d\n", n_tokens_all); + //LOG_TEE("prompt: %s\n", params.prompt.c_str()); llama_batch batch = llama_batch_init(params.n_batch, 0, 1); @@ -139,10 +133,10 @@ int main(int argc, char ** argv) { n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } - common_batch_clear(batch); + llama_batch_clear(batch); for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { - common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); + llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); } if (i + n_batch >= n_tokens_all) { @@ -150,11 +144,11 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - LOG_INF("%s: llama_decode() failed\n", __func__); + LOG_TEE("%s: llama_decode() failed\n", __func__); return 1; } - LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); if (i + n_batch >= n_tokens_all) { break; @@ -164,7 +158,7 @@ int main(int argc, char ** argv) { for (int i = n_ctx; i < n_tokens_all; i += n_batch) { const int n_discard = n_batch; - LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); + LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); @@ -173,10 +167,10 @@ int main(int argc, char ** argv) { n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; - common_batch_clear(batch); + llama_batch_clear(batch); for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { - common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); + llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); } if (i + n_batch >= n_tokens_all) { @@ -184,18 +178,18 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + LOG_TEE("%s: llama_decode() failed\n", __func__); return 1; } - LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); } { const int n_discard = n_past - n_ctx + n_predict; if (n_discard > 0) { - LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); + LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); @@ -206,69 +200,81 @@ int main(int argc, char ** argv) { } } - LOG_INF("\n"); - LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); - LOG_INF("\n"); + LOG_TEE("\n"); + LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); + LOG_TEE("\n"); // main loop int n_cur = n_tokens_all; int n_decode = 0; - LOG_INF("%s", prompt_suffix.c_str()); + LOG_TEE("%s", prompt_suffix.c_str()); + fflush(stdout); const auto t_main_start = ggml_time_us(); while (n_cur <= n_len) { // sample the next token { - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); + auto n_vocab = llama_n_vocab(model); + 
auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // sample the most likely token + const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of generation? - if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) { - LOG("\n"); + if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { + LOG_TEE("\n"); break; } - LOG("%s", common_token_to_piece(ctx, new_token_id).c_str()); + LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + fflush(stdout); n_decode += 1; // prepare the next batch - common_batch_clear(batch); + llama_batch_clear(batch); // push this new token for next evaluation - common_batch_add(batch, new_token_id, n_past++, { 0 }, true); + llama_batch_add(batch, new_token_id, n_past++, { 0 }, true); } n_cur += 1; // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG("\n"); + LOG_TEE("\n"); const auto t_main_end = ggml_time_us(); - LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG("\n"); - llama_perf_context_print(ctx); + llama_print_timings(ctx); - LOG("\n"); - - llama_sampler_free(smpl); + fprintf(stderr, "\n"); llama_batch_free(batch); llama_free(ctx); - llama_model_free(model); + llama_free_model(model); llama_backend_free(); diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index 3e6864093..be0f2fd02 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-perplexity) add_executable(${TARGET} perplexity.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9bf6c5743..dbe445391 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,21 +1,18 @@ -#include "arg.h" #include "common.h" -#include "log.h" #include "llama.h" -#include -#include -#include #include #include #include #include -#include -#include -#include #include #include +#include +#include #include +#include +#include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -34,6 +31,55 @@ struct results_log_softmax { float prob; }; +static void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const struct results_perplexity & results +) { + if (params.logdir.empty()) { + return; + } + + if (params.hellaswag) { + fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. 
No files will be written.\n", __func__); + return; + } + + const std::string timestamp = string_get_sortable_timestamp(); + + const bool success = fs_create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Perplexity Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + yaml_dump_vector_float(logfile, "logits", results.logits); + fprintf(logfile, "ppl_value: %f\n", results.ppl_value); + yaml_dump_vector_float(logfile, "probs", results.probs); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + static std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); float max_logit = logits[0]; @@ -120,7 +166,7 @@ static void process_logits( break; } lock.unlock(); - const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]); + const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); const double v = -results.log_softmax; local_nll += v; local_nll2 += v*v; @@ -154,7 +200,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits, break; } lock.unlock(); - const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]); + const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]); local_nll += v; local_nll2 += v*v; } @@ -232,9 +278,7 @@ static std::pair log_softmax(int n_vocab, const float * logits, c kld.sum_kld += sum; kld.sum_kld2 += sum*sum; ++kld.count; - if (imax == imax_base) { - ++kld.n_same_top; - } + if (imax == imax_base) ++kld.n_same_top; const float p_base = expf(-nll_base); const float p = expf(-nll); @@ -276,7 +320,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens break; } lock.unlock(); - std::pair v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); + std::pair v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); kld_values[i] = (float)v.first; p_diff_values[i] = v.second; } @@ -290,28 +334,25 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens } } -static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) { +static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); 
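Aside: the softmax helper restored above uses the usual max-subtraction trick so that exp() never overflows. The same routine, written as a self-contained program for reference:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static std::vector<float> softmax(const std::vector<float> & logits) {
        std::vector<float> probs(logits.size());
        float max_logit = logits[0];
        for (float v : logits) max_logit = std::max(max_logit, v);
        double sum = 0.0;
        for (size_t i = 0; i < logits.size(); ++i) {
            probs[i] = std::exp(logits[i] - max_logit);   // shift by max to avoid overflow
            sum += probs[i];
        }
        for (float & p : probs) p /= (float) sum;
        return probs;
    }

    int main() {
        for (float p : softmax({ 1.0f, 2.0f, 3.0f })) printf("%.4f ", p);
        printf("\n");   // 0.0900 0.2447 0.6652
    }

The log_softmax variants in the same hunk do the equivalent computation in log space so the negative log-likelihood of the observed token can be accumulated without ever forming tiny probabilities.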
+ GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); - const bool add_bos = llama_vocab_get_add_bos(vocab); - GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - LOG_INF("%s: tokenizing the input ..\n", __func__); - - std::vector tokens = common_tokenize(ctx, params.prompt, true); + std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); if (int(tokens.size()) < 2*n_ctx) { - LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, + fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, n_ctx); - LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -322,16 +363,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params prob_history.resize(tokens.size()); if (params.ppl_stride <= 0) { - LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); + fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); return {tokens, -1, logit_history, prob_history}; } const int calc_chunk = n_ctx; - LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); + fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); if (int(tokens.size()) <= calc_chunk) { - LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, + fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, tokens.size(), n_ctx, params.ppl_stride); return {tokens, -1, logit_history, prob_history}; } @@ -339,21 +380,20 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride; const int n_chunk = params.n_chunks < 0 ? 
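Aside: the strided variant above slices the token stream into overlapping windows: each chunk covers a full context (calc_chunk = n_ctx) and consecutive chunks start ppl_stride tokens apart, which is where the n_chunk_max formula comes from. A small sketch with toy sizes:

    #include <cstdio>

    int main() {
        const int n_tokens = 1200, n_ctx = 512, ppl_stride = 256;
        const int calc_chunk  = n_ctx;
        const int n_chunk_max = (n_tokens - calc_chunk + ppl_stride - 1) / ppl_stride;
        for (int i = 0; i < n_chunk_max; ++i) {
            const int start = i * ppl_stride;
            printf("chunk %d: tokens [%d, %d)\n", i, start, start + calc_chunk);
        }
    }

Only the last ppl_stride positions of each window contribute to the score, so every token is evaluated with at least n_ctx - ppl_stride tokens of preceding context.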
n_chunk_max : std::min(params.n_chunks, n_chunk_max); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_tokens(vocab); - int count = 0; double nll = 0.0; - LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); for (int i = 0; i < n_chunk; ++i) { const int start = i * params.ppl_stride; const int end = start + calc_chunk; const int num_batches = (calc_chunk + n_batch - 1) / n_batch; - //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); + //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); std::vector logits; @@ -362,21 +402,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params // clear the KV cache llama_kv_cache_clear(ctx); - llama_batch batch = llama_batch_init(n_batch, 0, 1); - for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); - common_batch_clear(batch); - for (int i = 0; i < batch_size; i++) { - common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true); - } - - //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); - if (llama_decode(ctx, batch)) { - //LOG_ERR("%s : failed to eval\n", __func__); - llama_batch_free(batch); + //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); + // TODO: use llama_batch.logits instead of relying on logits_all == true + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + //fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -385,38 +418,37 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_vocab_bos(vocab); + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } - const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab); + const auto batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); if (j == 0) { tokens[batch_start] = token_org; } } - llama_batch_free(batch); - const auto t_end = std::chrono::high_resolution_clock::now(); if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - LOG("%d hours ", total_seconds / (60*60)); + fprintf(stderr, "%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - LOG("%.2f minutes\n", total_seconds / 60.0); + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } - //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); + //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) { + // Calculate probability of 
next token, given the previous ones. const std::vector tok_logits( - logits.begin() + size_t(j + 0) * n_vocab, - logits.begin() + size_t(j + 1) * n_vocab); + logits.begin() + (j + 0) * n_vocab, + logits.begin() + (j + 1) * n_vocab); const float prob = softmax(tok_logits)[tokens[start + j + 1]]; logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]]; @@ -427,17 +459,18 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params } // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { - LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); } else { - LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); + printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); } + fflush(stdout); } - LOG("\n"); + printf("\n"); return {tokens, std::exp(nll / count), logit_history, prob_history}; } -static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) { +static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) { if (params.ppl_stride > 0) { return perplexity_v2(ctx, params); } @@ -447,36 +480,33 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - const bool add_bos = llama_vocab_get_add_bos(vocab); - GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); std::ofstream logits_stream; if (!params.logits_file.empty()) { logits_stream.open(params.logits_file.c_str(), std::ios::binary); if (!logits_stream.is_open()) { - LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); + fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); return {}; } - LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); + fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); logits_stream.write("_logits_", 8); logits_stream.write(reinterpret_cast(&n_ctx), sizeof(n_ctx)); } auto tim1 = std::chrono::high_resolution_clock::now(); - LOG_INF("%s: tokenizing the input ..\n", __func__); + fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - std::vector tokens = common_tokenize(ctx, params.prompt, true); + std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); - LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); if (int(tokens.size()) < 2*n_ctx) { - LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, + fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, n_ctx); - LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -489,10 +519,9 @@ static results_perplexity 
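Aside: as the comment in the hunk above says, perplexity is e^(average negative log-likelihood) of the predicted tokens. A minimal sketch of that reduction, assuming the per-token probabilities have already been produced by the softmax step:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // P(token_i | context) for a handful of positions; values are invented
        std::vector<double> p_next = { 0.25, 0.10, 0.50, 0.05 };
        double nll = 0.0;
        for (double p : p_next) nll += -std::log(p);
        const double ppl = std::exp(nll / p_next.size());
        printf("PPL = %.4f over %zu tokens\n", ppl, p_next.size());
    }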
perplexity(llama_context * ctx, const common_params & const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_tokens(vocab); - int count = 0; double nll = 0.0; double nll2 = 0.0; @@ -507,10 +536,10 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & std::vector logits; if (num_batches > 1) { - logits.reserve(size_t(n_ctx) * n_vocab); + logits.reserve((size_t)n_ctx * n_vocab); } - LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); + fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); std::vector workers(std::thread::hardware_concurrency() - 1); @@ -563,7 +592,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[seq_start] = llama_vocab_bos(vocab); + tokens[seq_start] = llama_token_bos(llama_get_model(ctx)); } for (int k = 0; k < batch_size; ++k) { @@ -583,13 +612,13 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & } if (llama_decode(ctx, batch)) { - LOG_INF("%s : failed to eval\n", __func__); + fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } if (num_batches > 1 && n_outputs > 0) { const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab); + logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab); } } @@ -598,13 +627,13 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & llama_synchronize(ctx); const auto t_end = std::chrono::high_resolution_clock::now(); const float t_total = std::chrono::duration(t_end - t_start).count(); - LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total*n_chunk/n_seq); if (total_seconds >= 60*60) { - LOG("%d hours ", total_seconds / (60*60)); + fprintf(stderr, "%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - LOG("%.2f minutes\n", total_seconds / 60.0); + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } for (int seq = 0; seq < n_seq_batch; seq++) { @@ -626,20 +655,19 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { - LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); + printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); } else { double av = nll/count; double av2 = nll2/count - av*av; - if (av2 > 0) { - av2 = sqrt(av2/(count-1)); - } - LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); + if (av2 > 0) av2 = sqrt(av2/(count-1)); + printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } } + fflush(stdout); logits.clear(); } - LOG("\n"); + printf("\n"); nll2 /= count; nll /= count; @@ -647,9 +675,9 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & nll2 -= nll * nll; if (nll2 > 0) { nll2 = sqrt(nll2/(count-1)); - LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", 
ppl, nll2*ppl); + printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); } else { - LOG_ERR("Unexpected negative standard deviation of log(prob)\n"); + printf("Unexpected negative standard deviation of log(prob)\n"); } llama_batch_free(batch); @@ -657,10 +685,10 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & return {tokens, ppl, logit_history, prob_history}; } -static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int n_batch, int n_vocab) { +static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int32_t n_batch, int32_t n_vocab) { int prev_outputs = 0; - for (int i = 0; i < (int) batch.n_tokens; i += n_batch) { - const int n_tokens = std::min(n_batch, batch.n_tokens - i); + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); llama_batch batch_view = { n_tokens, @@ -670,11 +698,12 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< batch.n_seq_id + i, batch.seq_id + i, batch.logits + i, + 0, 0, 0, // unused }; const int ret = llama_decode(ctx, batch_view); if (ret != 0) { - LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } @@ -683,7 +712,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< n_outputs += batch_view.logits[i] != 0; } - memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float)); + memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float)); prev_outputs += n_outputs; } @@ -698,9 +727,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto if (eval_results.size() != eval_pairs.size()) { eval_results.resize(eval_pairs.size()); } - if (eval_pairs.empty()) { - return; - } + if (eval_pairs.empty()) return; size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size()); @@ -708,13 +735,11 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () { float local_logprobs[K_TOKEN_CHUNK]; while (true) { - const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed); - if (first >= eval_results.size()) { - break; - } - const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size()); + size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed); + if (first >= eval_results.size()) break; + size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size()); for (size_t i = first; i < last; ++i) { - const auto * logits = batch_logits + eval_pairs[i].first * n_vocab; + auto logits = batch_logits + eval_pairs[i].first * n_vocab; float max_logit = logits[0]; for (int j = 1; j < n_vocab; ++j) { max_logit = std::max(max_logit, logits[j]); @@ -737,10 +762,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto } } -static void hellaswag_score(llama_context * ctx, const common_params & params) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - +static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // Calculates hellaswag score (acc_norm) from prompt // 
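Aside: compute_logprobs above parallelizes over the evaluation pairs by letting each worker claim K_TOKEN_CHUNK-sized ranges from a shared atomic counter, with the calling thread joining in as one more worker. A sketch of just that scheduling pattern, with the per-item work replaced by a trivial sum:

    #include <algorithm>
    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main() {
        constexpr size_t K_TOKEN_CHUNK = 4;
        std::vector<int> work(23, 1);
        std::atomic<size_t> counter(0);
        std::atomic<int>    total(0);

        auto compute = [&]() {
            while (true) {
                const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
                if (first >= work.size()) break;
                const size_t last = std::min(first + K_TOKEN_CHUNK, work.size());
                int local = 0;
                for (size_t i = first; i < last; ++i) local += work[i];
                total += local;
            }
        };

        std::vector<std::thread> workers(3);
        for (auto & w : workers) w = std::thread(compute);
        compute();                        // the main thread also participates
        for (auto & w : workers) w.join();
        printf("processed %d items\n", total.load());   // 23
    }

The chunked hand-out keeps contention on the counter low while still balancing load when individual chunks take uneven time.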
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -767,15 +789,15 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { } if (prompt_lines.size() % 6 != 0) { - LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__); + fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__); return; } size_t hs_task_count = prompt_lines.size()/6; - LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); + fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); - const bool is_spm = llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_SPM; - LOG_INF("================================= is_spm = %d\n", is_spm); + const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; + fprintf(stderr, "================================= is_spm = %d\n", is_spm); // The tasks should be randomized so the score stabilizes quickly. bool randomize_tasks = true; @@ -802,7 +824,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { std::vector seq_tokens[4]; }; - LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); + fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); // Select and read data from prompt lines std::vector hs_data(hs_task_count); @@ -821,7 +843,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); for (size_t j = 0; j < 4; j++) { hs_cur.ending[j] = prompt_lines[idx*6+2+j]; - hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true); + hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true); } // determine the common prefix of the endings @@ -848,17 +870,16 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { } } - LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__); + fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__); - LOG("\ntask\tacc_norm\n"); + printf("\ntask\tacc_norm\n"); double acc = 0.0f; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_tokens(vocab); - const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -866,7 +887,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { std::vector tok_logits(n_vocab); // TODO: this could be made smaller; it's currently the worst-case size - std::vector batch_logits(size_t(n_ctx)*n_vocab); + std::vector batch_logits(n_vocab*n_ctx); std::vector> eval_pairs; std::vector eval_results; @@ -878,7 +899,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { size_t i1 = i0; size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch - common_batch_clear(batch); + llama_batch_clear(batch); // batch as much tasks as possible into the available context // each task has 4 unique sequence ids - one for each ending @@ -894,7 +915,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { } for (size_t i = 0; i < hs_cur.common_prefix; ++i) { - 
common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); + llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix n_logits += 1; @@ -904,7 +925,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { // TODO: don't evaluate the last token of each sequence for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) { const bool needs_logits = i < seq_tokens_size - 1; - common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); + llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); n_logits += needs_logits; } } @@ -919,7 +940,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { } if (i0 == i1) { - LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); + fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -927,7 +948,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + fprintf(stderr, "%s: llama_decode() failed\n", __func__); return; } @@ -953,7 +974,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { auto & hs_cur = hs_data[i]; // get the logits of the last token of the common prefix - std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float)); + std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float)); const auto first_probs = softmax(tok_logits); @@ -977,7 +998,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { } } - //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx); + //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx); // If the gold ending got the maximum logprobe add one accuracy point if (ending_logprob_max_idx == hs_cur.gold_ending_idx) { @@ -985,7 +1006,8 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { } // Print the accumulated accuracy mean x 100 - LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); + printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); + fflush(stdout); } i0 = i1 - 1; @@ -993,7 +1015,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { llama_batch_free(batch); - LOG("\n"); + printf("\n"); } struct winogrande_entry { @@ -1037,7 +1059,7 @@ static std::vector load_winogrande_from_csv(const std::string } } if (ipos != 4) { - LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); + printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); continue; } auto sentence = line[comma_pos[0]+1] == '"' ? 
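Aside: the HellaSwag score (acc_norm) computed above rates each of the four endings by its average per-token log-probability and counts a hit when the argmax coincides with the gold ending. A sketch of just that decision, with invented log-probabilities:

    #include <cstddef>
    #include <cstdio>

    int main() {
        // summed log-probs and token counts for the 4 endings (made-up numbers)
        const double log_prob_sum[4] = { -14.2, -9.8, -17.5, -12.1 };
        const int    n_tokens[4]     = { 7, 5, 8, 6 };
        const int    gold_ending_idx = 1;

        size_t best     = 0;
        double best_avg = log_prob_sum[0] / n_tokens[0];
        for (size_t j = 1; j < 4; ++j) {
            const double avg = log_prob_sum[j] / n_tokens[j];   // length-normalized score
            if (avg > best_avg) { best_avg = avg; best = j; }
        }
        printf("picked ending %zu (gold %d) -> %s\n",
               best, gold_ending_idx, (int) best == gold_ending_idx ? "correct" : "wrong");
    }

Normalizing by the ending length is what distinguishes acc_norm from a raw total log-probability comparison, which would favour shorter endings.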
line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3) @@ -1051,13 +1073,13 @@ static std::vector load_winogrande_from_csv(const std::string if (sentence[where] == '_') break; } if (where == int(sentence.size())) { - LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str()); + printf("%s: no _ in <%s>\n", __func__, sentence.c_str()); continue; } std::istringstream stream(answer.c_str()); int i_answer; stream >> i_answer; if (stream.fail() || i_answer < 1 || i_answer > 2) { - LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); + printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); continue; } result.emplace_back(); @@ -1080,22 +1102,20 @@ static std::vector load_winogrande_from_csv(const std::string * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2 * */ -static void winogrande_score(llama_context * ctx, const common_params & params) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); +static void winogrande_score(llama_context * ctx, const gpt_params & params) { constexpr int k_min_trailing_ctx = 3; auto data = load_winogrande_from_csv(params.prompt); if (data.empty()) { - LOG_ERR("%s: no tasks\n", __func__); + fprintf(stderr, "%s: no tasks\n", __func__); return; } - LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size()); + fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size()); if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) { - LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); + fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); std::mt19937 rng(1); std::vector aux(data.size()); for (int i = 0; i < int(data.size()); ++i) { @@ -1113,11 +1133,11 @@ static void winogrande_score(llama_context * ctx, const common_params & params) data = std::move(selected); } - LOG_INF("%s : tokenizing selected tasks\n", __func__); + fprintf(stderr, "%s : tokenizing selected tasks\n", __func__); for (auto & task : data) { - task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true); - task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true); + task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true); + task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true); task.common_prefix = 0; for (size_t k = 0; k < task.seq_tokens[0].size(); k++) { @@ -1132,17 +1152,16 @@ static void winogrande_score(llama_context * ctx, const common_params & params) task.seq_tokens[0].size() - task.common_prefix + task.seq_tokens[1].size() - task.common_prefix; - task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size(); - task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size(); + task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size(); + task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size(); } - LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__); + fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_tokens(vocab); - const int max_tasks_per_batch = 128; const int max_seq = 
std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1150,7 +1169,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) std::vector tok_logits(n_vocab); // TODO: this could be made smaller; it's currently the worst-case size - std::vector batch_logits(size_t(n_ctx)*n_vocab); + std::vector batch_logits(n_vocab*n_ctx); std::vector> eval_pairs; std::vector eval_results; @@ -1165,7 +1184,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) size_t i1 = i0; size_t i_logits = 0; - common_batch_clear(batch); + llama_batch_clear(batch); while (n_cur + (int) data[i1].required_tokens <= n_ctx) { int n_logits = 0; @@ -1175,7 +1194,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) } for (size_t i = 0; i < data[i1].common_prefix; ++i) { - common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); + llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); } batch.logits[batch.n_tokens - 1] = true; n_logits += 1; @@ -1183,7 +1202,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) for (int s = 0; s < 2; ++s) { // TODO: end before the last token, no need to predict past the end of the sequences for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) { - common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); + llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); n_logits += 1; } } @@ -1198,7 +1217,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) } if (i0 == i1) { - LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); + fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -1206,7 +1225,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + fprintf(stderr, "%s: llama_decode() failed\n", __func__); return; } @@ -1266,20 +1285,20 @@ static void winogrande_score(llama_context * ctx, const common_params & params) ++n_done; // print the accumulated accuracy mean x 100 - LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer); + printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer); + fflush(stdout); } i0 = i1 - 1; } - LOG("\n"); + printf("\n"); if (n_done < 100) return; const float p = 1.f*n_correct/n_done; const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1)); - - LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); + printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); } static bool deserialize_string(std::istream & in, std::string & str) { @@ -1328,7 +1347,7 @@ struct multiple_choice_task { static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) { if (task.question.empty() || task.mc1.answers.empty()) { if (log_error) { - LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__); + printf("%s: found bad task with empty question and/or answers\n", __func__); } return false; } @@ -1336,11 +1355,11 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic for (auto& answer : task.mc1.answers) { if 
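Aside: the "+/-" printed with the final Winogrande score above is a binomial standard error on the accuracy, scaled to percent. Reproduced standalone with invented counts:

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n_correct = 912, n_done = 1267;     // hypothetical tallies
        const float p     = 1.f * n_correct / n_done;
        const float sigma = 100.f * sqrtf(p * (1 - p) / (n_done - 1));
        printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n",
               n_done, 100.0 * p, (double) sigma);
    }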
(answer.empty()) { if (log_error) { - LOG_ERR("%s: found empty answer\n", __func__); + printf("%s: found empty answer\n", __func__); } return false; } - task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true)); + task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true)); } auto min_len = task.seq_tokens.front().size(); for (auto& seq : task.seq_tokens) { @@ -1384,22 +1403,20 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic // git@hf.co:datasets/Stevross/mmlu // https://huggingface.co/datasets/truthful_qa // -static void multiple_choice_score(llama_context * ctx, const common_params & params) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); +static void multiple_choice_score(llama_context * ctx, const gpt_params & params) { std::istringstream strstream(params.prompt); uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); if (strstream.fail() || n_task == 0) { - LOG_ERR("%s: no tasks\n", __func__); + printf("%s: no tasks\n", __func__); return; } - LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task); + printf("%s: there are %u tasks in prompt\n", __func__, n_task); std::vector task_pos(n_task); strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t)); if (strstream.fail()) { - LOG_ERR("%s: failed to read task positions from prompt\n", __func__); + printf("%s: failed to read task positions from prompt\n", __func__); return; } @@ -1407,21 +1424,21 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) { // Use all tasks tasks.resize(n_task); - LOG_INF("%s: reading tasks", __func__); + printf("%s: reading tasks", __func__); int n_dot = std::max((int) n_task/100, 1); int i = 0; for (auto& task : tasks) { ++i; if (!task.deserialize(strstream)) { - LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task); + printf("%s: failed to read task %d of %u\n", __func__, i, n_task); return; } - if (i%n_dot == 0) LOG("."); + if (i%n_dot == 0) printf("."); } - LOG("done\n"); + printf("done\n"); } else { - LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); + printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); std::mt19937 rng(1); std::vector aux(n_task); for (uint32_t i = 0; i < n_task; ++i) aux[i] = i; @@ -1434,16 +1451,18 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par aux.pop_back(); strstream.seekg(task_pos[idx], std::ios::beg); if (!task.deserialize(strstream)) { - LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); + printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); return; } } n_task = params.multiple_choice_tasks; } - LOG_INF("%s: preparing task data", __func__); + printf("%s: preparing task data", __func__); + fflush(stdout); if (n_task > 500) { - LOG("..."); + printf("..."); + fflush(stdout); std::atomic counter(0); std::atomic n_bad(0); auto prepare = [&counter, &n_bad, &tasks, ctx] () { @@ -1467,10 +1486,11 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par for (auto& w : workers) w = std::thread(prepare); prepare(); for (auto& w : workers) w.join(); - LOG("done\n"); + printf("done\n"); + fflush(stdout); int 
nbad = n_bad; if (nbad > 0) { - LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad); + printf("%s: found %d malformed tasks\n", __func__, nbad); return; } } else { @@ -1482,28 +1502,28 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par return; } if (i_task%n_dot == 0) { - LOG("."); + printf("."); + fflush(stdout); } } - LOG("done\n"); + printf("done\n"); } - LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); + printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); - LOG("\ntask\tacc_norm\n"); + printf("\ntask\tacc_norm\n"); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_tokens(vocab); - const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); std::vector tok_logits(n_vocab); - std::vector batch_logits(size_t(n_ctx)*n_vocab); + std::vector batch_logits(n_vocab*n_ctx); std::vector> eval_pairs; std::vector eval_results; @@ -1520,7 +1540,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par size_t i1 = i0; size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch - common_batch_clear(batch); + llama_batch_clear(batch); // batch as much tasks as possible into the available context // each task has 4 unique sequence ids - one for each ending @@ -1543,7 +1563,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par for (size_t i = 0; i < cur_task.common_prefix; ++i) { //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); - common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); + llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix n_logits += 1; @@ -1553,7 +1573,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par // TODO: don't evaluate the last token of each sequence for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) { const bool needs_logits = i < seq_tokens_size - 1; - common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); + llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); n_logits += needs_logits; } } @@ -1570,7 +1590,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par } if (i0 == i1) { - LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); + fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -1578,7 +1598,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + fprintf(stderr, "%s: llama_decode() failed\n", __func__); return; } @@ -1602,16 +1622,16 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par // compute the logprobs for each ending of the decoded tasks for (size_t i = i0; i < i1; ++i) { auto & cur_task = tasks[i]; - //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); + //printf("==== Evaluating <%s> with correct answer ", 
cur_task.question.c_str()); //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) { // if (cur_task.mc1.labels[j] == 1) { - // LOG("%d", j+1); + // printf("%d", j+1); // } //} - //LOG("\n common_prefix: %zu\n", cur_task.common_prefix); + //printf("\n common_prefix: %zu\n", cur_task.common_prefix); // get the logits of the last token of the common prefix - std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float)); + std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); const auto first_probs = softmax(tok_logits); @@ -1620,13 +1640,13 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par size_t count = 1; float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]); for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { - //LOG(" %zu %g\n", ir, eval_results[ir]); + //printf(" %zu %g\n", ir, eval_results[ir]); ++count; log_prob += eval_results[ir++]; } cur_task.log_probs[s] = log_prob / count; - //LOG(" Final: %g\n", log_prob / count); - //LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); + //printf(" Final: %g\n", log_prob / count); + //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); } // Find the ending with maximum logprob @@ -1646,7 +1666,8 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par ++n_done; // Print the accumulated accuracy mean x 100 - LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); + printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); + fflush(stdout); } i0 = i1 - 1; @@ -1658,33 +1679,29 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par float p = 1.f*n_correct/n_done; float sigma = sqrt(p*(1-p)/(n_done-1)); - LOG("\n"); - LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); p = 1.f*n_done/n_tot_answers; sigma = sqrt(p*(1-p)/(n_done-1)); - LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); - LOG_INF("\n"); + printf("\n"); } -static void kl_divergence(llama_context * ctx, const common_params & params) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - +static void kl_divergence(llama_context * ctx, const gpt_params & params) { if (params.logits_file.empty()) { - LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); + fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; } std::ifstream in(params.logits_file.c_str(), std::ios::binary); if (!in) { - LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str()); + fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str()); return; } { char check[9]; check[8] = 0; in.read(check, 8); if (in.fail() || strncmp("_logits_", check, 8) != 0) { - LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); + fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); return; } } @@ -1692,40 +1709,39 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { uint32_t n_ctx; in.read((char *)&n_ctx, sizeof(n_ctx)); if (n_ctx 
> llama_n_ctx(ctx)) { - LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", + fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", __func__, params.logits_file.c_str(), n_ctx, params.n_ctx); } - int n_vocab; - int n_chunk; + int n_vocab, n_chunk; in.read((char *)&n_vocab, sizeof(n_vocab)); in.read((char *)&n_chunk, sizeof(n_chunk)); if (in.fail()) { - LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); + fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); return; } - if (n_vocab != llama_vocab_n_tokens(vocab)) { - LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_tokens(vocab)); + if (n_vocab != llama_n_vocab(llama_get_model(ctx))) { + fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); } - std::vector tokens(size_t(n_ctx) * n_chunk); + std::vector tokens(n_ctx * n_chunk); if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { - LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); + fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); return; } const int n_batch = params.n_batch; const int num_batches = (n_ctx + n_batch - 1)/n_batch; const int nv = 2*((n_vocab + 1)/2) + 4; - const bool add_bos = llama_vocab_get_add_bos(vocab); - GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); std::vector log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); std::vector kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector logits; if (num_batches > 1) { - logits.reserve(size_t(n_ctx) * n_vocab); + logits.reserve(n_ctx * n_vocab); } std::vector workers(std::thread::hardware_concurrency() - 1); @@ -1759,15 +1775,13 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { - LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i); + fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i); return; } // clear the KV cache llama_kv_cache_clear(ctx); - llama_batch batch = llama_batch_init(n_batch, 0, 1); - for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); @@ -1777,17 +1791,12 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_vocab_bos(vocab); + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } - common_batch_clear(batch); - for (int i = 0; i < batch_size; i++) { - common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true); - } - - if (llama_decode(ctx, batch)) { - LOG_ERR("%s : failed to eval\n", __func__); - llama_batch_free(batch); + // TODO: use llama_batch.logits instead of relying on logits_all == true + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + 
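Aside: the per-token KL divergence accumulated by this tool compares the full next-token distribution of the evaluated model against the base-model distribution stored in the logits file: sum over the vocabulary of p_base * (log p_base - log p). A sketch with a four-token toy vocabulary and made-up distributions:

    #include <cmath>
    #include <cstdio>

    int main() {
        const double p_base[4]  = { 0.70, 0.15, 0.10, 0.05 };  // base model (from the stored log-probs)
        const double p_eval[4]  = { 0.60, 0.20, 0.12, 0.08 };  // model being evaluated
        double kld = 0.0;
        for (int i = 0; i < 4; ++i) {
            kld += p_base[i] * (std::log(p_base[i]) - std::log(p_eval[i]));
        }
        printf("KLD = %.6f\n", kld);   // >= 0, and 0 only if the distributions match
    }

In the real code this is evaluated per position from the uint16-packed base log-probs, then averaged over all evaluated tokens to give the "Mean KLD" line.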
fprintf(stderr, "%s : failed to eval\n", __func__); return; } @@ -1796,105 +1805,105 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { if (num_batches > 1) { const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); } } - llama_batch_free(batch); - const auto t_end = std::chrono::high_resolution_clock::now(); if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - LOG("%d hours ", total_seconds / (60*60)); + fprintf(stderr, "%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - LOG("%.2f minutes\n", total_seconds / 60.0); + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + + printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); } - LOG("\n"); - LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); const int first = n_ctx/2; const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr); p_diff_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first; - LOG("%4d", i+1); + printf("%4d", i+1); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); const double ppl_val = exp(log_ppl.first); const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); + printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); + printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); + printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); const double p_diff_rms_val = sqrt(p_diff_mse.first); const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); double p_top_val = 1.*kld.n_same_top/kld.count; double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1)); - LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); + printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); - LOG("\n"); + printf("\n"); + + fflush(stdout); logits.clear(); } - LOG("\n"); + printf("\n"); if (kld.count < 100) return; // we do not wish to do statistics on so few values 
std::sort(kld_values.begin(), kld_values.end()); std::sort(p_diff_values.begin(), p_diff_values.end()); - LOG("====== Perplexity statistics ======\n"); + printf("====== Perplexity statistics ======\n"); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); const double ppl_val = exp(log_ppl.first); const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); + printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); const double ppl_base_val = exp(log_ppl_base.first); const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 ) - LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); + printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); - // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); + // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second); - LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); + printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); + printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); const double ppl_ratio_val = exp(log_ppl_ratio_val); const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 ) - LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); + printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov; const double ppl_diff_val = ppl_val - ppl_base_val; const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov); - LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); + printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); - LOG("\n"); + printf("\n"); - LOG("====== KL divergence statistics ======\n"); + printf("====== KL divergence statistics ======\n"); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); + printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); auto kld_median = kld_values.size()%2 == 0 ? 
0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1]) : kld_values[kld_values.size()/2]; @@ -1906,68 +1915,67 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)]; }; - LOG("Maximum KLD: %10.6f\n", kld_values.back()); - LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f)); - LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); - LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); - LOG("Median KLD: %10.6f\n", kld_median); - LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f)); - LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f)); - LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f)); - LOG("Minimum KLD: %10.6f\n", kld_values.front()); + printf("Maximum KLD: %10.6f\n", kld_values.back()); + printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f)); + printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); + printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); + printf("Median KLD: %10.6f\n", kld_median); + printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f)); + printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f)); + printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f)); + printf("Minimum KLD: %10.6f\n", kld_values.front()); - LOG("\n"); + printf("\n"); - LOG("====== Token probability statistics ======\n"); + printf("====== Token probability statistics ======\n"); auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count); - LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second); + printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second); auto p_diff_median = p_diff_values.size()%2 == 0 ? 
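Aside: the percentile helper used for the KLD and Δp tables is a linear interpolation over the sorted sample vector. The same helper as a standalone function (the input must already be sorted, as kld_values is above):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    static float percentile(const std::vector<float> & values, float frac) {
        const float  point = frac * (values.size() - 1);
        const size_t ip    = (size_t) point;
        const float  p     = point - ip;   // interpolate between neighbouring samples
        return (1 - p) * values[ip] + p * values[std::min(ip + 1, values.size() - 1)];
    }

    int main() {
        std::vector<float> v = { 0.01f, 0.02f, 0.05f, 0.10f, 0.80f };   // sorted ascending
        printf("median = %.3f, 99%% = %.3f\n", percentile(v, 0.5f), percentile(v, 0.99f));
    }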
0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1]) : p_diff_values[p_diff_values.size()/2]; - LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); - LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); - LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); - LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); - LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); - LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); - LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); - LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); - LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); - LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); - LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); - LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); - LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); + printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); + printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); + printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); + printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); + printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); + printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); + printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); + printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); + printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); + printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); + printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); + printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); + printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); - // LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); + // printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); const double p_diff_rms_val = sqrt(p_diff_mse.first); const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); const double same_top_p = 1.0*kld.n_same_top/kld.count; - LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); + printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); + } int main(int argc, char ** argv) { - common_params params; + gpt_params params; params.n_ctx = 512; params.logits_all = true; - params.escape = false; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); - const int32_t n_ctx = params.n_ctx; if (n_ctx <= 0) { - LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__); + fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__); return 1; } @@ -1992,36 +2000,45 @@ int main(int argc, char ** argv) { } if (params.ppl_stride > 0) { - LOG_INF("Will perform strided perplexity calculation -> adjusting 
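Aside: as I read the statistics above, Δp is the difference in the probability both models assign to the actual next token, and "RMS Δp" is the root mean square of those differences. A sketch with invented samples:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // p_eval - p_base for a few positions (made-up values)
        std::vector<double> p_diff = { 0.02, -0.01, 0.05, -0.03, 0.00 };
        double sum2 = 0.0;
        for (double d : p_diff) sum2 += d * d;
        const double rms = std::sqrt(sum2 / p_diff.size());
        printf("RMS Δp : %6.3lf %%\n", 100.0 * rms);
    }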
context size from %d to %d\n", + fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", params.n_ctx, params.n_ctx + params.ppl_stride/2); params.n_ctx += params.ppl_stride/2; } + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + llama_backend_init(); llama_numa_init(params.numa); + llama_model * model; + llama_context * ctx; + // load the model and apply lora adapter, if any - common_init_result llama_init = common_init_from_params(params); - - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); - + std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == NULL) { - LOG_ERR("%s: unable to load model\n", __func__); + fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } - const int n_ctx_train = llama_model_n_ctx_train(model); + const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { - LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); } // print system information { - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } struct results_perplexity results; @@ -2037,8 +2054,11 @@ int main(int argc, char ** argv) { results = perplexity(ctx, params, n_ctx); } - LOG("\n"); - llama_perf_context_print(ctx); + llama_print_timings(ctx); + write_logfile(ctx, params, model, results); + + llama_free(ctx); + llama_free_model(model); llama_backend_free(); diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index 9a3a0d3cd..bb986a716 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index bd2f73467..68cf8d359 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,7 +1,7 @@ +#define LLAMA_API_INTERNAL +#include "common.h" #include "ggml.h" #include "llama.h" -#include "llama-context.h" -#include "common.h" #include #include @@ -9,9 +9,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -140,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { } static void test_roundtrip_on_chunk( - const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference, + const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference, float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats ) { if (layer->type == GGML_TYPE_F16) { @@ -154,7 +156,7 @@ static void test_roundtrip_on_chunk( if (use_reference) { 
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size); } else { - qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size); + qfns.from_float(input_scratch, quantized_scratch, chunk_size); } qfns.to_float(quantized_scratch, output_scratch, chunk_size); @@ -164,7 +166,7 @@ static void test_roundtrip_on_chunk( // Run quantization function for a single layer and update error stats static void test_roundtrip_on_layer( - std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference, + std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference, const ggml_tensor * layer, std::vector & input_scratch, std::vector & quantized_scratch, std::vector & output_scratch, error_stats & total_error, int max_thread = 0 ) { @@ -185,13 +187,13 @@ static void test_roundtrip_on_layer( int num_chunks = (nelements + chunk_size - 1)/chunk_size; if (num_chunks < 2 || max_thread < 2) { - test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(), + test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(), output_scratch.data(), print_layer_stats ? layer_error : total_error); } else { auto & stats = print_layer_stats ? layer_error : total_error; std::mutex mutex; uint64_t counter = 0; - auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr, + auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr, &quantized_scratch, &output_scratch, chunk_size] () { error_stats local_stats {}; while (true) { @@ -203,7 +205,7 @@ static void test_roundtrip_on_layer( } lock.unlock(); uint64_t chunk = offset + chunk_size < nelements ? 
chunk_size : nelements - offset; - test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset, + test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset, quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats); } }; @@ -309,7 +311,7 @@ int main(int argc, char ** argv) { auto mparams = llama_model_default_params(); mparams.use_mlock = false; - model = llama_model_load_from_file(params.model.c_str(), mparams); + model = llama_load_model_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); @@ -317,24 +319,25 @@ int main(int argc, char ** argv) { } auto cparams = llama_context_default_params(); - cparams.n_ctx = 256; + cparams.n_ctx = 256; + cparams.seed = 1; - ctx = llama_init_from_model(model, cparams); + ctx = llama_new_context_with_model(model, cparams); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_model_free(model); + llama_free_model(model); return 1; } } - const auto & tensors = llama_internal_get_tensor_map(ctx); + const auto &tensors = llama_internal_get_tensor_map(ctx); // check layer tensors int included_layers = 0; int64_t max_nelements = 0; bool is_f16 = false; - for (const auto & kv_tensor : tensors) { + for (const auto& kv_tensor : tensors) { if (!layer_included(params, kv_tensor.first)) { continue; } @@ -347,7 +350,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: error: Quantization should be tested with a float model, " "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type); llama_free(ctx); - llama_model_free(model); + llama_free_model(model); return 1; } included_layers++; @@ -369,9 +372,8 @@ int main(int argc, char ** argv) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } - const auto * qfns = ggml_get_type_traits(type); - const auto * qfns_cpu = ggml_get_type_traits_cpu(type); - if (qfns_cpu->from_float && qfns->to_float) { + ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + if (qfns.from_float && qfns.to_float) { if (params.verbose) { printf("testing %s ...\n", ggml_type_name(type)); } @@ -380,7 +382,7 @@ int main(int argc, char ** argv) { error_stats global_stats {}; - for (const auto & kv_tensor : tensors) { + for (const auto& kv_tensor : tensors) { if (!layer_included(params, kv_tensor.first)) { continue; } @@ -392,7 +394,7 @@ int main(int argc, char ** argv) { test_roundtrip_on_layer( layer_name, params.per_layer_stats, - *qfns, *qfns_cpu, + qfns, params.reference, kv_tensor.second, input_scratch, @@ -409,7 +411,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - llama_model_free(model); + llama_free_model(model); // report timing { const int64_t t_main_end_us = ggml_time_us(); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 47e5cbe30..3ee4eb971 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET llama-quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) 
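The quantize-stats round-trip above quantizes each chunk with `from_float`, converts it back with `to_float`, and accumulates error statistics against the original tensor data. A minimal standalone sketch of that measurement, with placeholder quantize/dequantize callbacks standing in for the ggml type traits (illustrative only, not llama.cpp code):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <functional>
#include <vector>

// Stand-ins for the type-traits callbacks used by quantize-stats
// (from_float / to_float); here they are plain std::function objects.
using quantize_fn   = std::function<void(const float * src, std::vector<float> & packed, size_t n)>;
using dequantize_fn = std::function<void(const std::vector<float> & packed, float * dst, size_t n)>;

struct error_stats_sketch {
    double sum_sq_err = 0.0;
    double max_err    = 0.0;
    size_t n          = 0;
};

// Round-trip one chunk: quantize, dequantize, accumulate error against the input.
static void roundtrip_chunk(const float * input, size_t n,
                            const quantize_fn & q, const dequantize_fn & dq,
                            error_stats_sketch & stats) {
    std::vector<float> packed, output(n);
    q(input, packed, n);
    dq(packed, output.data(), n);
    for (size_t i = 0; i < n; ++i) {
        const double err = std::fabs(output[i] - input[i]);
        stats.sum_sq_err += err * err;
        stats.max_err     = std::max(stats.max_err, err);
        stats.n++;
    }
}

int main() {
    // Toy "quantizer": round to a fixed step (purely illustrative).
    const float step = 0.1f;
    quantize_fn   q  = [&](const float * s, std::vector<float> & p, size_t n) {
        p.assign(s, s + n);
        for (auto & v : p) { v = std::round(v / step); }
    };
    dequantize_fn dq = [&](const std::vector<float> & p, float * d, size_t n) {
        for (size_t i = 0; i < n; ++i) { d[i] = p[i] * step; }
    };

    std::vector<float> data = { 0.12f, -0.34f, 0.56f, -0.78f };
    error_stats_sketch stats;
    roundtrip_chunk(data.data(), data.size(), q, dq, stats);
    printf("rmse = %f, max = %f over %zu values\n",
           std::sqrt(stats.sum_sq_err / stats.n), stats.max_err, stats.n);
    return 0;
}
```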
target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/README.md b/examples/quantize/README.md index f9cce7b21..553c2701b 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -34,7 +34,7 @@ Run the quantized model: ```bash # start inference on a gguf model -./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant" +./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. @@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 8d47b17b6..8d7647258 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -26,8 +26,6 @@ static const std::vector QUANT_OPTIONS = { { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, - { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", }, - { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, @@ -48,6 +46,9 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, + { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, @@ -60,16 +61,6 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count"; -static bool striequals(const char * a, const char * b) { - while (*a && *b) { - if (std::tolower(*a) != std::tolower(*b)) { - return false; - } - a++; b++; - } - 
return *a == *b; -} - static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { std::string ftype_str; @@ -77,7 +68,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp ftype_str.push_back(std::toupper(ch)); } for (auto & it : QUANT_OPTIONS) { - if (striequals(it.name.c_str(), ftype_str.c_str())) { + if (it.name == ftype_str) { ftype = it.ftype; ftype_str_out = it.name; return true; @@ -100,7 +91,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp } // usage: -// ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] +// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // [[noreturn]] static void usage(const char * executable) { @@ -113,7 +104,7 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --keep-split: will generate quantized model in the same shards as input\n"); + printf(" --keep-split: will generate quatized model in the same shards as input"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n"); @@ -232,15 +223,15 @@ static int prepare_imatrix(const std::string & imatrix_file, } static ggml_type parse_ggml_type(const char * arg) { - for (int i = 0; i < GGML_TYPE_COUNT; ++i) { - auto type = (ggml_type)i; + ggml_type result = GGML_TYPE_COUNT; + for (int j = 0; j < GGML_TYPE_COUNT; ++j) { + auto type = ggml_type(j); const auto * name = ggml_type_name(type); - if (name && striequals(name, arg)) { - return type; + if (name && strcmp(arg, name) == 0) { + result = type; break; } } - fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg); - return GGML_TYPE_COUNT; + return result; } int main(int argc, char ** argv) { @@ -261,18 +252,12 @@ int main(int argc, char ** argv) { } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) { if (arg_idx < argc-1) { params.output_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.output_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } } else { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) { if (arg_idx < argc-1) { params.token_embedding_type = parse_ggml_type(argv[++arg_idx]); - if (params.token_embedding_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } } else { usage(argv[0]); } diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh index 70f7610f9..24bc970e8 100644 --- a/examples/quantize/tests.sh +++ b/examples/quantize/tests.sh @@ -47,7 +47,7 @@ echo PASS echo # 3a. Test the requanted model is loading properly -$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32 echo PASS echo @@ -57,7 +57,7 @@ echo PASS echo # 4b. 
Test the requanted model is loading properly -$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32 echo PASS echo diff --git a/examples/retrieval/CMakeLists.txt b/examples/retrieval/CMakeLists.txt index 512a602ec..66610f311 100644 --- a/examples/retrieval/CMakeLists.txt +++ b/examples/retrieval/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-retrieval) add_executable(${TARGET} retrieval.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 2439022a2..eb89d16da 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -1,16 +1,15 @@ -#include "arg.h" #include "common.h" -#include "log.h" #include "llama.h" #include #include -#include // TODO: remove me -static void print_usage(int, char ** argv) { - LOG("\nexample usage:\n"); - LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); - LOG("\n"); +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); + LOG_TEE("\n"); } struct chunk { @@ -19,7 +18,7 @@ struct chunk { // original file position size_t filepos; // original text data - std::string textdata; + std::string textdata = ""; // tokenized text data std::vector tokens; // embedding @@ -33,14 +32,14 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz std::ifstream f(filename.c_str()); if (!f.is_open()) { - LOG_ERR("could not open file %s\n", filename.c_str()); + fprintf(stderr, "Error: could not open file %s\n", filename.c_str()); return chunks; } chunk current_chunk; char buffer[1024]; int64_t filepos = 0; - std::string current; + std::string current = ""; while (f.read(buffer, 1024)) { current += std::string(buffer, f.gcount()); size_t pos; @@ -77,7 +76,7 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { size_t n_tokens = tokens.size(); for (size_t i = 0; i < n_tokens; i++) { - common_batch_add(batch, tokens[i], i, { seq_id }, true); + llama_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -86,9 +85,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu llama_kv_cache_clear(ctx); // run model - LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); if (llama_decode(ctx, batch) < 0) { - LOG_ERR("%s : failed to decode\n", __func__); + fprintf(stderr, "%s : failed to decode\n", __func__); } for (int i = 0; i < batch.n_tokens; i++) { @@ -101,41 +100,42 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu if (embd == NULL) { embd = llama_get_embeddings_ith(ctx, i); if (embd == NULL) { - LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i); + fprintf(stderr, "%s: failed to get embeddings for token 
%d\n", __func__, i); continue; } } float * out = output + batch.seq_id[i][0] * n_embd; - common_embd_normalize(embd, out, n_embd, 2); + llama_embd_normalize(embd, out, n_embd); } } int main(int argc, char ** argv) { - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } - common_init(); - // For BERT models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; params.embedding = true; if (params.chunk_size <= 0) { - LOG_ERR("chunk_size must be positive\n"); + fprintf(stderr, "chunk_size must be positive\n"); return 1; } if (params.context_files.empty()) { - LOG_ERR("context_files must be specified\n"); + fprintf(stderr, "context_files must be specified\n"); return 1; } - LOG_INF("processing files:\n"); + print_build_info(); + + printf("processing files:\n"); for (auto & context_file : params.context_files) { - LOG_INF("%s\n", context_file.c_str()); + printf("%s\n", context_file.c_str()); } std::vector chunks; @@ -143,42 +143,39 @@ int main(int argc, char ** argv) { std::vector file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); } - LOG_INF("Number of chunks: %zu\n", chunks.size()); + printf("Number of chunks: %ld\n", chunks.size()); llama_backend_init(); llama_numa_init(params.numa); + llama_model * model; + llama_context * ctx; + // load the model - common_init_result llama_init = common_init_from_params(params); - - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); - + std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == NULL) { - LOG_ERR("%s: unable to load model\n", __func__); + fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } - const llama_vocab * vocab = llama_model_get_vocab(model); - - const int n_ctx_train = llama_model_n_ctx_train(model); + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - LOG_ERR("%s: pooling type NONE not supported\n", __func__); + fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); return 1; } if (n_ctx > n_ctx_train) { - LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n", + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } // max batch size @@ -187,15 +184,15 @@ int main(int argc, char ** argv) { // tokenize the prompts and trim for (auto & chunk : chunks) { - auto inp = common_tokenize(ctx, chunk.textdata, true, false); + auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false); if (inp.size() > n_batch) { - LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); return 1; } // add eos if not present - if (llama_vocab_eos(vocab) >= 0 && (inp.empty() || inp.back() != 
llama_vocab_eos(vocab))) { - inp.push_back(llama_vocab_eos(vocab)); + if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) { + inp.push_back(llama_token_eos(model)); } chunk.tokens = inp; } @@ -203,12 +200,12 @@ int main(int argc, char ** argv) { // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) chunks.size(); i++) { - LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); - LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); + fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); for (int j = 0; j < (int) chunks[i].tokens.size(); j++) { - LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); } - LOG_INF("\n\n"); + fprintf(stderr, "\n\n"); } } @@ -217,7 +214,7 @@ int main(int argc, char ** argv) { struct llama_batch batch = llama_batch_init(n_batch, 0, 1); // allocate output - const int n_embd = llama_model_n_embd(model); + const int n_embd = llama_n_embd(model); std::vector embeddings(n_chunks * n_embd, 0); float * emb = embeddings.data(); @@ -234,7 +231,7 @@ int main(int argc, char ** argv) { if (batch.n_tokens + n_toks > n_batch) { float * out = emb + p * n_embd; batch_decode(ctx, batch, out, s, n_embd); - common_batch_clear(batch); + llama_batch_clear(batch); p += s; s = 0; } @@ -255,27 +252,26 @@ int main(int argc, char ** argv) { chunks[i].tokens.clear(); } - struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1); - // start loop, receive query and return top k similar chunks based on cosine similarity std::string query; while (true) { - LOG("Enter query: "); + printf("Enter query: "); std::getline(std::cin, query); - std::vector query_tokens = common_tokenize(ctx, query, true); + std::vector query_tokens = llama_tokenize(ctx, query, true); + struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1); batch_add_seq(query_batch, query_tokens, 0); std::vector query_emb(n_embd, 0); batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd); - common_batch_clear(query_batch); + llama_batch_clear(query_batch); // compute cosine similarities { std::vector> similarities; for (int i = 0; i < n_chunks; i++) { - float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd); + float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd); similarities.push_back(std::make_pair(i, sim)); } @@ -284,21 +280,20 @@ int main(int argc, char ** argv) { return a.second > b.second; }); - LOG("Top %d similar chunks:\n", params.sampling.top_k); - for (int i = 0; i < std::min(params.sampling.top_k, (int) chunks.size()); i++) { - LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str()); - LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); - LOG("similarity: %f\n", similarities[i].second); - LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); - LOG("--------------------\n"); + printf("Top %d similar chunks:\n", params.sparams.top_k); + for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) { + printf("filename: %s\n", chunks[similarities[i].first].filename.c_str()); + printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); + 
printf("similarity: %f\n", similarities[i].second); + printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); + printf("--------------------\n"); } } } - LOG("\n"); - llama_perf_context_print(ctx); - // clean up - llama_batch_free(query_batch); + llama_print_timings(ctx); + llama_free(ctx); + llama_free_model(model); llama_backend_free(); } diff --git a/examples/rpc/README.md b/examples/rpc/README.md index 312bb634d..e1da801f2 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -1,30 +1,25 @@ ## Overview -> [!IMPORTANT] -> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and -> insecure. **Never run the RPC server on an open network or in a sensitive environment!** - The `rpc-server` allows running `ggml` backend on a remote host. The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them. This can be used for distributed LLM inference with `llama.cpp` in the following way: ```mermaid flowchart TD - rpcb<-->|TCP|srva - rpcb<-->|TCP|srvb - rpcb<-.->|TCP|srvn + rpcb---|TCP|srva + rpcb---|TCP|srvb + rpcb-.-|TCP|srvn subgraph hostn[Host N] - srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"] + srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"] end subgraph hostb[Host B] - srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"] + srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"] end subgraph hosta[Host A] - srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"] + srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"] end subgraph host[Main Host] - local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli] - ggml[llama-cli]<-->rpcb[RPC backend] + ggml[llama.cpp]---rpcb[RPC backend] end style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5 ``` @@ -63,12 +58,17 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. -On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options. -Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`: +On the main host build `llama.cpp` only with `-DGGML_RPC=ON`: + +```bash +mkdir build-rpc +cd build-rpc +cmake .. -DGGML_RPC=ON +cmake --build . --config Release +``` + +Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: ```bash $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 ``` - -This way you can offload model layers to both local and remote devices. 
- diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 8b1b23eda..7c15d2aa4 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -1,5 +1,3 @@ -#include "ggml-cpu.h" - #ifdef GGML_USE_CUDA #include "ggml-cuda.h" #endif @@ -8,14 +6,6 @@ #include "ggml-metal.h" #endif -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef GGML_USE_SYCL -#include "ggml-sycl.h" -#endif - #include "ggml-rpc.h" #ifdef _WIN32 # include @@ -26,7 +16,7 @@ #include struct rpc_server_params { - std::string host = "127.0.0.1"; + std::string host = "0.0.0.0"; int port = 50052; size_t backend_mem = 0; }; @@ -89,18 +79,6 @@ static ggml_backend_t create_backend() { if (!backend) { fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); } -#elif GGML_USE_VULKAN - fprintf(stderr, "%s: using Vulkan backend\n", __func__); - backend = ggml_backend_vk_init(0); // init device 0 - if (!backend) { - fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__); - } -#elif GGML_USE_SYCL - fprintf(stderr, "%s: using SYCL backend\n", __func__); - backend = ggml_backend_sycl_init(0); // init device 0 - if (!backend) { - fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__); - } #endif // if there aren't GPU Backends fallback to CPU backend @@ -114,10 +92,6 @@ static ggml_backend_t create_backend() { static void get_backend_memory(size_t * free_mem, size_t * total_mem) { #ifdef GGML_USE_CUDA ggml_backend_cuda_get_device_memory(0, free_mem, total_mem); -#elif GGML_USE_VULKAN - ggml_backend_vk_get_device_memory(0, free_mem, total_mem); -#elif GGML_USE_SYCL - ggml_backend_sycl_get_device_memory(0, free_mem, total_mem); #else #ifdef _WIN32 MEMORYSTATUSEX status; @@ -140,17 +114,6 @@ int main(int argc, char * argv[]) { fprintf(stderr, "Invalid parameters\n"); return 1; } - - if (params.host != "127.0.0.1") { - fprintf(stderr, "\n"); - fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); - fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str()); - fprintf(stderr, " Never expose the RPC server to an open network!\n"); - fprintf(stderr, " This is an experimental feature and is not secure!\n"); - fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); - fprintf(stderr, "\n"); - } - ggml_backend_t backend = create_backend(); if (!backend) { fprintf(stderr, "Failed to create backend\n"); @@ -165,7 +128,7 @@ int main(int argc, char * argv[]) { get_backend_memory(&free_mem, &total_mem); } printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024)); - ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem); + start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem); ggml_backend_free(backend); return 0; } diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt deleted file mode 100644 index cd6b0520e..000000000 --- a/examples/run/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-run) -add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/run/README.md b/examples/run/README.md deleted file mode 100644 index 89a552079..000000000 --- a/examples/run/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# llama.cpp/example/run - -The purpose of this example is to demonstrate a minimal usage of 
llama.cpp for running models. - -```bash -llama-run granite3-moe -``` - -```bash -Description: - Runs a llm - -Usage: - llama-run [options] model [prompt] - -Options: - -c, --context-size - Context size (default: 2048) - -n, -ngl, --ngl - Number of GPU layers (default: 0) - --temp - Temperature (default: 0.8) - -v, --verbose, --log-verbose - Set verbosity level to infinity (i.e. log all messages, useful for debugging) - -h, --help - Show help message - -Commands: - model - Model is a string with an optional prefix of - huggingface:// (hf://), ollama://, https:// or file://. - If no protocol is specified and a file exists in the specified - path, file:// is assumed, otherwise if a file does not exist in - the specified path, ollama:// is assumed. Models that are being - pulled are downloaded with .partial extension while being - downloaded and then renamed as the file without the .partial - extension when complete. - -Examples: - llama-run llama3 - llama-run ollama://granite-code - llama-run ollama://smollm:135m - llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf - llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf - llama-run https://example.com/some-file1.gguf - llama-run some-file2.gguf - llama-run file://some-file3.gguf - llama-run --ngl 999 some-file4.gguf - llama-run --ngl 999 some-file5.gguf Hello World -``` diff --git a/examples/run/linenoise.cpp/LICENSE b/examples/run/linenoise.cpp/LICENSE deleted file mode 100644 index b006b3b24..000000000 --- a/examples/run/linenoise.cpp/LICENSE +++ /dev/null @@ -1,26 +0,0 @@ -Copyright (c) 2010-2014, Salvatore Sanfilippo -Copyright (c) 2010-2013, Pieter Noordhuis -Copyright (c) 2025, Eric Curtin - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/examples/run/linenoise.cpp/linenoise.cpp b/examples/run/linenoise.cpp/linenoise.cpp deleted file mode 100644 index a68f12a1a..000000000 --- a/examples/run/linenoise.cpp/linenoise.cpp +++ /dev/null @@ -1,1350 +0,0 @@ -#ifndef _WIN32 -/* - * You can find the latest source code at: - * - * http://github.com/ericcurtin/linenoise.cpp - * - * Does a number of crazy assumptions that happen to be true in 99.9999% of - * the 2010 UNIX computers around. 
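The llama-run README above resolves the model argument by protocol: an explicit huggingface:// (hf://), ollama://, https:// or file:// prefix is used as-is; otherwise an existing local path is treated as file:// and anything else falls back to ollama://. A rough sketch of that resolution rule as described (an illustration of the documented behavior, not the actual run.cpp code):

```cpp
#include <filesystem>
#include <iostream>
#include <string>
#include <vector>

// Resolve a llama-run style model string to a protocol, following the rules
// described in the README above: an explicit prefix wins; otherwise an existing
// local path is treated as file://, and anything else falls back to ollama://.
// (Sketch only -- the real llama-run logic lives in examples/run/run.cpp.)
static std::string resolve_protocol(const std::string & model) {
    static const std::vector<std::string> prefixes = {
        "huggingface://", "hf://", "ollama://", "https://", "file://",
    };
    for (const auto & p : prefixes) {
        if (model.rfind(p, 0) == 0) {
            return p;
        }
    }
    return std::filesystem::exists(model) ? "file://" : "ollama://";
}

int main() {
    const std::vector<std::string> examples = {
        "hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf",
        "granite3-moe",
        "some-file2.gguf",
    };
    for (const auto & m : examples) {
        std::cout << m << " -> " << resolve_protocol(m) << "\n";
    }
    return 0;
}
```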
- * - * ------------------------------------------------------------------------ - * - * Copyright (c) 2010-2023, Salvatore Sanfilippo - * Copyright (c) 2010-2013, Pieter Noordhuis - * Copyright (c) 2025, Eric Curtin - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * ------------------------------------------------------------------------ - * - * References: - * - http://invisible-island.net/xterm/ctlseqs/ctlseqs.html - * - http://www.3waylabs.com/nw/WWW/products/wizcon/vt220.html - * - * Todo list: - * - Filter bogus Ctrl+ combinations. - * - Win32 support - * - * Bloat: - * - History search like Ctrl+r in readline? - * - * List of escape sequences used by this program, we do everything just - * with three sequences. In order to be so cheap we may have some - * flickering effect with some slow terminal, but the lesser sequences - * the more compatible. - * - * EL (Erase Line) - * Sequence: ESC [ n K - * Effect: if n is 0 or missing, clear from cursor to end of line - * Effect: if n is 1, clear from beginning of line to cursor - * Effect: if n is 2, clear entire line - * - * CUF (CUrsor Forward) - * Sequence: ESC [ n C - * Effect: moves cursor forward n chars - * - * CUB (CUrsor Backward) - * Sequence: ESC [ n D - * Effect: moves cursor backward n chars - * - * The following is used to get the terminal width if getting - * the width with the TIOCGWINSZ ioctl fails - * - * DSR (Device Status Report) - * Sequence: ESC [ 6 n - * Effect: reports the current cusor position as ESC [ n ; m R - * where n is the row and m is the column - * - * When multi line mode is enabled, we also use an additional escape - * sequence. However multi line editing is disabled by default. - * - * CUU (Cursor Up) - * Sequence: ESC [ n A - * Effect: moves cursor up of n chars. - * - * CUD (Cursor Down) - * Sequence: ESC [ n B - * Effect: moves cursor down of n chars. - * - * When linenoiseClearScreen() is called, two additional escape sequences - * are used in order to clear the screen and position the cursor at home - * position. 
- * - * CUP (Cursor position) - * Sequence: ESC [ H - * Effect: moves the cursor to upper left corner - * - * ED (Erase display) - * Sequence: ESC [ 2 J - * Effect: clear the whole screen - * - */ - -# include "linenoise.h" - -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include - -# include -# include -# include - -# define LINENOISE_DEFAULT_HISTORY_MAX_LEN 100 -# define LINENOISE_MAX_LINE 4096 -static std::vector unsupported_term = { "dumb", "cons25", "emacs" }; -static linenoiseCompletionCallback *completionCallback = NULL; -static linenoiseHintsCallback *hintsCallback = NULL; -static linenoiseFreeHintsCallback *freeHintsCallback = NULL; -static char *linenoiseNoTTY(void); -static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseCompletions *lc, int flags); -static void refreshLineWithFlags(struct linenoiseState *l, int flags); - -static struct termios orig_termios; /* In order to restore at exit.*/ -static int maskmode = 0; /* Show "***" instead of input. For passwords. */ -static int rawmode = 0; /* For atexit() function to check if restore is needed*/ -static int mlmode = 0; /* Multi line mode. Default is single line. */ -static int atexit_registered = 0; /* Register atexit just 1 time. */ -static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN; -static int history_len = 0; -static char **history = NULL; - -enum KEY_ACTION{ - KEY_NULL = 0, /* NULL */ - CTRL_A = 1, /* Ctrl+a */ - CTRL_B = 2, /* Ctrl-b */ - CTRL_C = 3, /* Ctrl-c */ - CTRL_D = 4, /* Ctrl-d */ - CTRL_E = 5, /* Ctrl-e */ - CTRL_F = 6, /* Ctrl-f */ - CTRL_H = 8, /* Ctrl-h */ - TAB = 9, /* Tab */ - CTRL_K = 11, /* Ctrl+k */ - CTRL_L = 12, /* Ctrl+l */ - ENTER = 13, /* Enter */ - CTRL_N = 14, /* Ctrl-n */ - CTRL_P = 16, /* Ctrl-p */ - CTRL_T = 20, /* Ctrl-t */ - CTRL_U = 21, /* Ctrl+u */ - CTRL_W = 23, /* Ctrl+w */ - ESC = 27, /* Escape */ - BACKSPACE = 127 /* Backspace */ -}; - -static void linenoiseAtExit(void); -int linenoiseHistoryAdd(const char *line); -#define REFRESH_CLEAN (1<<0) // Clean the old prompt from the screen -#define REFRESH_WRITE (1<<1) // Rewrite the prompt on the screen. -#define REFRESH_ALL (REFRESH_CLEAN|REFRESH_WRITE) // Do both. -static void refreshLine(struct linenoiseState *l); - -class File { - public: - FILE * file = nullptr; - - FILE * open(const std::string & filename, const char * mode) { - file = fopen(filename.c_str(), mode); - - return file; - } - - int lock() { - if (file) { - fd = fileno(file); - if (flock(fd, LOCK_EX | LOCK_NB) != 0) { - fd = -1; - - return 1; - } - } - - return 0; - } - - ~File() { - if (fd >= 0) { - flock(fd, LOCK_UN); - } - - if (file) { - fclose(file); - } - } - - private: - int fd = -1; -}; - -__attribute__((format(printf, 1, 2))) -/* Debugging function. */ -#if 0 -static void lndebug(const char *fmt, ...) { - static File file; - if (file.file == nullptr) { - file.open("/tmp/lndebug.txt", "a"); - } - - if (file.file != nullptr) { - va_list args; - va_start(args, fmt); - vfprintf(file.file, fmt, args); - va_end(args); - fflush(file.file); - } -} -#else -static void lndebug(const char *, ...) { -} -#endif - -/* ======================= Low level terminal handling ====================== */ - -/* Enable "mask mode". When it is enabled, instead of the input that - * the user is typing, the terminal will just display a corresponding - * number of asterisks, like "****". This is useful for passwords and other - * secrets that should not be displayed. 
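The linenoise header comment above deliberately limits itself to a handful of VT100 sequences: EL to erase (part of) a line, CUF/CUB to move the cursor horizontally, DSR to query the cursor position, CUU/CUD for multi-line mode, and CUP/ED for clearing the screen. A throwaway demo of emitting a few of them (not linenoise code, which writes the sequences with write() from its refresh functions):

```cpp
#include <cstdio>

int main() {
    printf("some text that will be erased");
    printf("\r");        // carriage return: back to column 0
    printf("\x1b[0K");   // EL  - erase from cursor to end of line
    printf("redrawn line");
    printf("\x1b[5D");   // CUB - move cursor 5 columns left
    printf("\x1b[2C");   // CUF - move cursor 2 columns right
    fflush(stdout);

    // DSR: "\x1b[6n" asks the terminal where the cursor is; it answers on
    // stdin with ESC [ row ; col R. Reading the reply needs raw mode, as in
    // getCursorPosition() above, so it is omitted here.

    printf("\n");
    return 0;
}
```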
*/ -void linenoiseMaskModeEnable(void) { - maskmode = 1; -} - -/* Disable mask mode. */ -void linenoiseMaskModeDisable(void) { - maskmode = 0; -} - -/* Set if to use or not the multi line mode. */ -void linenoiseSetMultiLine(int ml) { - mlmode = ml; -} - -/* Return true if the terminal name is in the list of terminals we know are - * not able to understand basic escape sequences. */ -static int isUnsupportedTerm(void) { - char *term = getenv("TERM"); - if (term == NULL) return 0; - for (size_t j = 0; j < unsupported_term.size(); ++j) { - if (!strcasecmp(term, unsupported_term[j])) { - return 1; - } - } - return 0; -} - -/* Raw mode: 1960 magic shit. */ -static int enableRawMode(int fd) { - struct termios raw; - - if (!isatty(STDIN_FILENO)) goto fatal; - if (!atexit_registered) { - atexit(linenoiseAtExit); - atexit_registered = 1; - } - if (tcgetattr(fd,&orig_termios) == -1) goto fatal; - - raw = orig_termios; /* modify the original mode */ - /* input modes: no break, no CR to NL, no parity check, no strip char, - * no start/stop output control. */ - raw.c_iflag &= ~(BRKINT | ICRNL | INPCK | ISTRIP | IXON); - /* output modes - disable post processing */ - raw.c_oflag &= ~(OPOST); - /* control modes - set 8 bit chars */ - raw.c_cflag |= (CS8); - /* local modes - choing off, canonical off, no extended functions, - * no signal chars (^Z,^C) */ - raw.c_lflag &= ~(ECHO | ICANON | IEXTEN | ISIG); - /* control chars - set return condition: min number of bytes and timer. - * We want read to return every single byte, without timeout. */ - raw.c_cc[VMIN] = 1; raw.c_cc[VTIME] = 0; /* 1 byte, no timer */ - - /* put terminal in raw mode after flushing */ - if (tcsetattr(fd,TCSAFLUSH,&raw) < 0) goto fatal; - rawmode = 1; - return 0; - -fatal: - errno = ENOTTY; - return -1; -} - -static void disableRawMode(int fd) { - /* Don't even check the return value as it's too late. */ - if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1) - rawmode = 0; -} - -/* Use the ESC [6n escape sequence to query the horizontal cursor position - * and return it. On error -1 is returned, on success the position of the - * cursor. */ -static int getCursorPosition(int ifd, int ofd) { - char buf[32]; - int cols, rows; - unsigned int i = 0; - - /* Report cursor location */ - if (write(ofd, "\x1b[6n", 4) != 4) return -1; - - /* Read the response: ESC [ rows ; cols R */ - while (i < sizeof(buf)-1) { - if (read(ifd,buf+i,1) != 1) break; - if (buf[i] == 'R') break; - i++; - } - buf[i] = '\0'; - - /* Parse it. */ - if (buf[0] != ESC || buf[1] != '[') return -1; - if (sscanf(buf+2,"%d;%d",&rows,&cols) != 2) return -1; - return cols; -} - -/* Try to get the number of columns in the current terminal, or assume 80 - * if it fails. */ -static int getColumns(int ifd, int ofd) { - struct winsize ws; - - if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) { - /* ioctl() failed. Try to query the terminal itself. */ - int start, cols; - - /* Get the initial position so we can restore it later. */ - start = getCursorPosition(ifd,ofd); - if (start == -1) goto failed; - - /* Go to right margin and get position. */ - if (write(ofd,"\x1b[999C",6) != 6) goto failed; - cols = getCursorPosition(ifd,ofd); - if (cols == -1) goto failed; - - /* Restore position. */ - if (cols > start) { - char seq[32]; - snprintf(seq,32,"\x1b[%dD",cols-start); - if (write(ofd,seq,strlen(seq)) == -1) { - /* Can't recover... */ - } - } - return cols; - } else { - return ws.ws_col; - } - -failed: - return 80; -} - -/* Clear the screen. 
Used to handle ctrl+l */ -void linenoiseClearScreen(void) { - if (write(STDOUT_FILENO,"\x1b[H\x1b[2J",7) <= 0) { - /* nothing to do, just to avoid warning. */ - } -} - -/* Beep, used for completion when there is nothing to complete or when all - * the choices were already shown. */ -static void linenoiseBeep(void) { - fprintf(stderr, "\x7"); - fflush(stderr); -} - -/* Called by completeLine() and linenoiseShow() to render the current - * edited line with the proposed completion. If the current completion table - * is already available, it is passed as second argument, otherwise the - * function will use the callback to obtain it. - * - * Flags are the same as refreshLine*(), that is REFRESH_* macros. */ -static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseCompletions *lc, int flags) { - /* Obtain the table of completions if the caller didn't provide one. */ - linenoiseCompletions ctable; - if (lc == NULL) { - completionCallback(ls->buf, &ctable); - lc = &ctable; - } - - /* Show the edited line with completion if possible, or just refresh. */ - if (ls->completion_idx < lc->len) { - struct linenoiseState saved = *ls; - ls->len = ls->pos = strlen(lc->cvec[ls->completion_idx]); - ls->buf = lc->cvec[ls->completion_idx]; - refreshLineWithFlags(ls, flags); - ls->len = saved.len; - ls->pos = saved.pos; - ls->buf = saved.buf; - } else { - refreshLineWithFlags(ls, flags); - } - - if (lc == &ctable) { - ctable.to_free = false; - } -} - -/* This is an helper function for linenoiseEdit*() and is called when the - * user types the key in order to complete the string currently in the - * input. - * - * The state of the editing is encapsulated into the pointed linenoiseState - * structure as described in the structure definition. - * - * If the function returns non-zero, the caller should handle the - * returned value as a byte read from the standard input, and process - * it as usually: this basically means that the function may return a byte - * read from the termianl but not processed. Otherwise, if zero is returned, - * the input was consumed by the completeLine() function to navigate the - * possible completions, and the caller should read for the next characters - * from stdin. */ -static int completeLine(struct linenoiseState *ls, int keypressed) { - linenoiseCompletions lc; - int nwritten; - char c = keypressed; - - completionCallback(ls->buf, &lc); - if (lc.len == 0) { - linenoiseBeep(); - ls->in_completion = 0; - } else { - switch(c) { - case 9: /* tab */ - if (ls->in_completion == 0) { - ls->in_completion = 1; - ls->completion_idx = 0; - } else { - ls->completion_idx = (ls->completion_idx + 1) % (lc.len + 1); - if (ls->completion_idx == lc.len) linenoiseBeep(); - } - c = 0; - break; - case 27: /* escape */ - /* Re-show original buffer */ - if (ls->completion_idx < lc.len) refreshLine(ls); - ls->in_completion = 0; - c = 0; - break; - default: - /* Update buffer and return */ - if (ls->completion_idx < lc.len) { - nwritten = snprintf(ls->buf, ls->buflen, "%s", lc.cvec[ls->completion_idx]); - ls->len = ls->pos = nwritten; - } - ls->in_completion = 0; - break; - } - - /* Show completion or original buffer */ - if (ls->in_completion && ls->completion_idx < lc.len) { - refreshLineWithCompletion(ls, &lc, REFRESH_ALL); - } else { - refreshLine(ls); - } - } - - return c; /* Return last read character */ -} - -/* Register a callback function to be called for tab-completion. 
*/ -void linenoiseSetCompletionCallback(linenoiseCompletionCallback *fn) { - completionCallback = fn; -} - -/* Register a hits function to be called to show hits to the user at the - * right of the prompt. */ -void linenoiseSetHintsCallback(linenoiseHintsCallback *fn) { - hintsCallback = fn; -} - -/* Register a function to free the hints returned by the hints callback - * registered with linenoiseSetHintsCallback(). */ -void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *fn) { - freeHintsCallback = fn; -} - -/* This function is used by the callback function registered by the user - * in order to add completion options given the input string when the - * user typed . See the example.c source code for a very easy to - * understand example. */ -void linenoiseAddCompletion(linenoiseCompletions *lc, const char *str) { - const size_t len = strlen(str); - auto copy = std::make_unique(len + 1); - if (!copy) { - return; - } - - memcpy(copy.get(), str, len + 1); - char ** cvec = static_cast(std::realloc(lc->cvec, sizeof(char *) * (lc->len + 1))); - if (cvec == nullptr) { - return; - } - - lc->cvec = cvec; - lc->cvec[lc->len++] = copy.release(); -} - -/* Helper of refreshSingleLine() and refreshMultiLine() to show hints - * to the right of the prompt. */ -static void refreshShowHints(std::string & ab, struct linenoiseState * l, int plen) { - char seq[64]; - if (hintsCallback && plen+l->len < l->cols) { - int color = -1, bold = 0; - const char *hint = hintsCallback(l->buf,&color,&bold); - if (hint) { - int hintlen = strlen(hint); - int hintmaxlen = l->cols-(plen+l->len); - if (hintlen > hintmaxlen) hintlen = hintmaxlen; - if (bold == 1 && color == -1) color = 37; - if (color != -1 || bold != 0) - snprintf(seq,64,"\033[%d;%d;49m",bold,color); - else - seq[0] = '\0'; - ab.append(seq); - ab.append(hint, hintlen); - if (color != -1 || bold != 0) - ab.append("\033[0m"); - - /* Call the function to free the hint returned. */ - if (freeHintsCallback) freeHintsCallback(hint); - } - } -} - -/* Single line low level line refresh. - * - * Rewrite the currently edited line accordingly to the buffer content, - * cursor position, and number of columns of the terminal. - * - * Flags is REFRESH_* macros. The function can just remove the old - * prompt, just write it, or both. */ -static void refreshSingleLine(struct linenoiseState *l, int flags) { - char seq[64]; - size_t plen = strlen(l->prompt); - int fd = l->ofd; - char *buf = l->buf; - size_t len = l->len; - size_t pos = l->pos; - std::string ab; - while((plen+pos) >= l->cols) { - buf++; - len--; - pos--; - } - while (plen+len > l->cols) { - len--; - } - - /* Cursor to left edge */ - snprintf(seq,sizeof(seq),"\r"); - ab.append(seq); - - if (flags & REFRESH_WRITE) { - /* Write the prompt and the current buffer content */ - ab.append(l->prompt); - if (maskmode == 1) { - while (len--) { - ab.append("*"); - } - } else { - ab.append(buf, len); - } - /* Show hits if any. */ - refreshShowHints(ab, l, plen); - } - - /* Erase to right */ - snprintf(seq,sizeof(seq),"\x1b[0K"); - ab.append(seq); - if (flags & REFRESH_WRITE) { - /* Move cursor to original position. */ - snprintf(seq,sizeof(seq),"\r\x1b[%dC", (int)(pos+plen)); - ab.append(seq); - } - - (void) !write(fd, ab.c_str(), ab.size()); /* Can't recover from write error. */ -} - -/* Multi line low level line refresh. - * - * Rewrite the currently edited line accordingly to the buffer content, - * cursor position, and number of columns of the terminal. - * - * Flags is REFRESH_* macros. 
The function can just remove the old - * prompt, just write it, or both. */ -static void refreshMultiLine(struct linenoiseState *l, int flags) { - char seq[64]; - int plen = strlen(l->prompt); - int rows = (plen+l->len+l->cols-1)/l->cols; /* rows used by current buf. */ - int rpos = (plen+l->oldpos+l->cols)/l->cols; /* cursor relative row. */ - int rpos2; /* rpos after refresh. */ - int col; /* colum position, zero-based. */ - int old_rows = l->oldrows; - int fd = l->ofd, j; - std::string ab; - l->oldrows = rows; - - /* First step: clear all the lines used before. To do so start by - * going to the last row. */ - if (flags & REFRESH_CLEAN) { - if (old_rows-rpos > 0) { - lndebug("go down %d", old_rows-rpos); - snprintf(seq,64,"\x1b[%dB", old_rows-rpos); - ab.append(seq); - } - - /* Now for every row clear it, go up. */ - for (j = 0; j < old_rows-1; j++) { - lndebug("clear+up"); - snprintf(seq,64,"\r\x1b[0K\x1b[1A"); - ab.append(seq); - } - } - - if (flags & REFRESH_ALL) { - /* Clean the top line. */ - lndebug("clear"); - snprintf(seq,64,"\r\x1b[0K"); - ab.append(seq); - } - - if (flags & REFRESH_WRITE) { - /* Write the prompt and the current buffer content */ - ab.append(l->prompt); - if (maskmode == 1) { - for (unsigned int i = 0; i < l->len; ++i) { - ab.append("*"); - } - } else { - ab.append(l->buf, l->len); - } - - /* Show hits if any. */ - refreshShowHints(ab, l, plen); - - /* If we are at the very end of the screen with our prompt, we need to - * emit a newline and move the prompt to the first column. */ - if (l->pos && - l->pos == l->len && - (l->pos+plen) % l->cols == 0) - { - lndebug(""); - ab.append("\n"); - snprintf(seq,64,"\r"); - ab.append(seq); - rows++; - if (rows > (int)l->oldrows) l->oldrows = rows; - } - - /* Move cursor to right position. */ - rpos2 = (plen+l->pos+l->cols)/l->cols; /* Current cursor relative row */ - lndebug("rpos2 %d", rpos2); - - /* Go up till we reach the expected positon. */ - if (rows-rpos2 > 0) { - lndebug("go-up %d", rows-rpos2); - snprintf(seq,64,"\x1b[%dA", rows-rpos2); - ab.append(seq); - } - - /* Set column. */ - col = (plen+(int)l->pos) % (int)l->cols; - lndebug("set col %d", 1+col); - if (col) - snprintf(seq,64,"\r\x1b[%dC", col); - else - snprintf(seq,64,"\r"); - ab.append(seq); - } - - lndebug("\n"); - l->oldpos = l->pos; - (void) !write(fd, ab.c_str(), ab.size()); /* Can't recover from write error. */ -} - -/* Calls the two low level functions refreshSingleLine() or - * refreshMultiLine() according to the selected mode. */ -static void refreshLineWithFlags(struct linenoiseState *l, int flags) { - if (mlmode) - refreshMultiLine(l,flags); - else - refreshSingleLine(l,flags); -} - -/* Utility function to avoid specifying REFRESH_ALL all the times. */ -static void refreshLine(struct linenoiseState *l) { - refreshLineWithFlags(l,REFRESH_ALL); -} - -/* Hide the current line, when using the multiplexing API. */ -void linenoiseHide(struct linenoiseState *l) { - if (mlmode) - refreshMultiLine(l,REFRESH_CLEAN); - else - refreshSingleLine(l,REFRESH_CLEAN); -} - -/* Show the current line, when using the multiplexing API. */ -void linenoiseShow(struct linenoiseState *l) { - if (l->in_completion) { - refreshLineWithCompletion(l,NULL,REFRESH_WRITE); - } else { - refreshLineWithFlags(l,REFRESH_WRITE); - } -} - -/* Insert the character 'c' at cursor current position. - * - * On error writing to the terminal -1 is returned, otherwise 0. 
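refreshMultiLine() above derives everything it needs from four numbers: prompt length, buffer length, cursor offset and terminal width. The same row/column arithmetic pulled out into a standalone sketch (the struct and names are illustrative, not from linenoise):

```cpp
#include <cstdio>

// The wrapping arithmetic used by refreshMultiLine(): how many terminal rows
// the prompt + buffer occupy, and which row/column the cursor lands on.
// plen = prompt length, len = buffer length, pos = cursor offset, cols = width.
struct cursor_layout {
    int rows; // rows used by prompt + buffer
    int rpos; // 1-based row the cursor is on
    int col;  // 0-based column of the cursor
};

static cursor_layout layout(int plen, int len, int pos, int cols) {
    cursor_layout l;
    l.rows = (plen + len + cols - 1) / cols; // ceil((plen + len) / cols)
    l.rpos = (plen + pos + cols) / cols;     // row containing the cursor
    l.col  = (plen + pos) % cols;            // column within that row
    return l;
}

int main() {
    // 10-char prompt, 150-char line, cursor at offset 120, 80-column terminal
    const cursor_layout l = layout(10, 150, 120, 80);
    printf("rows=%d cursor_row=%d cursor_col=%d\n", l.rows, l.rpos, l.col);
    return 0;
}
```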
*/ -static int linenoiseEditInsert(struct linenoiseState * l, char c) { - if (l->len < l->buflen) { - if (l->len == l->pos) { - l->buf[l->pos] = c; - l->pos++; - l->len++; - l->buf[l->len] = '\0'; - if ((!mlmode && l->plen+l->len < l->cols && !hintsCallback)) { - /* Avoid a full update of the line in the - * trivial case. */ - char d = (maskmode==1) ? '*' : c; - if (write(l->ofd,&d,1) == -1) return -1; - } else { - refreshLine(l); - } - } else { - memmove(l->buf+l->pos+1,l->buf+l->pos,l->len-l->pos); - l->buf[l->pos] = c; - l->len++; - l->pos++; - l->buf[l->len] = '\0'; - refreshLine(l); - } - } - return 0; -} - -/* Move cursor on the left. */ -static void linenoiseEditMoveLeft(struct linenoiseState * l) { - if (l->pos > 0) { - l->pos--; - refreshLine(l); - } -} - -/* Move cursor on the right. */ -static void linenoiseEditMoveRight(struct linenoiseState * l) { - if (l->pos != l->len) { - l->pos++; - refreshLine(l); - } -} - -/* Move cursor to the start of the line. */ -static void linenoiseEditMoveHome(struct linenoiseState * l) { - if (l->pos != 0) { - l->pos = 0; - refreshLine(l); - } -} - -/* Move cursor to the end of the line. */ -static void linenoiseEditMoveEnd(struct linenoiseState * l) { - if (l->pos != l->len) { - l->pos = l->len; - refreshLine(l); - } -} - -/* Substitute the currently edited line with the next or previous history - * entry as specified by 'dir'. */ -#define LINENOISE_HISTORY_NEXT 0 -#define LINENOISE_HISTORY_PREV 1 - -static void linenoiseEditHistoryNext(struct linenoiseState * l, int dir) { - if (history_len > 1) { - /* Update the current history entry before to - * overwrite it with the next one. */ - free(history[history_len - 1 - l->history_index]); - history[history_len - 1 - l->history_index] = strdup(l->buf); - /* Show the new entry */ - l->history_index += (dir == LINENOISE_HISTORY_PREV) ? 1 : -1; - if (l->history_index < 0) { - l->history_index = 0; - return; - } else if (l->history_index >= history_len) { - l->history_index = history_len-1; - return; - } - strncpy(l->buf,history[history_len - 1 - l->history_index],l->buflen); - l->buf[l->buflen-1] = '\0'; - l->len = l->pos = strlen(l->buf); - refreshLine(l); - } -} - -/* Delete the character at the right of the cursor without altering the cursor - * position. Basically this is what happens with the "Delete" keyboard key. */ -static void linenoiseEditDelete(struct linenoiseState * l) { - if (l->len > 0 && l->pos < l->len) { - memmove(l->buf+l->pos,l->buf+l->pos+1,l->len-l->pos-1); - l->len--; - l->buf[l->len] = '\0'; - refreshLine(l); - } -} - -/* Backspace implementation. */ -static void linenoiseEditBackspace(struct linenoiseState * l) { - if (l->pos > 0 && l->len > 0) { - memmove(l->buf+l->pos-1,l->buf+l->pos,l->len-l->pos); - l->pos--; - l->len--; - l->buf[l->len] = '\0'; - refreshLine(l); - } -} - -/* Delete the previosu word, maintaining the cursor at the start of the - * current word. */ -static void linenoiseEditDeletePrevWord(struct linenoiseState * l) { - size_t old_pos = l->pos; - size_t diff; - - while (l->pos > 0 && l->buf[l->pos-1] == ' ') - l->pos--; - while (l->pos > 0 && l->buf[l->pos-1] != ' ') - l->pos--; - diff = old_pos - l->pos; - memmove(l->buf+l->pos,l->buf+old_pos,l->len-old_pos+1); - l->len -= diff; - refreshLine(l); -} - -/* This function is part of the multiplexed API of Linenoise, that is used - * in order to implement the blocking variant of the API but can also be - * called by the user directly in an event driven program. It will: - * - * 1. 
Initialize the linenoise state passed by the user. - * 2. Put the terminal in RAW mode. - * 3. Show the prompt. - * 4. Return control to the user, that will have to call linenoiseEditFeed() - * each time there is some data arriving in the standard input. - * - * The user can also call linenoiseEditHide() and linenoiseEditShow() if it - * is required to show some input arriving asyncronously, without mixing - * it with the currently edited line. - * - * When linenoiseEditFeed() returns non-NULL, the user finished with the - * line editing session (pressed enter CTRL-D/C): in this case the caller - * needs to call linenoiseEditStop() to put back the terminal in normal - * mode. This will not destroy the buffer, as long as the linenoiseState - * is still valid in the context of the caller. - * - * The function returns 0 on success, or -1 if writing to standard output - * fails. If stdin_fd or stdout_fd are set to -1, the default is to use - * STDIN_FILENO and STDOUT_FILENO. - */ -int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt) { - /* Populate the linenoise state that we pass to functions implementing - * specific editing functionalities. */ - l->in_completion = 0; - l->ifd = stdin_fd != -1 ? stdin_fd : STDIN_FILENO; - l->ofd = stdout_fd != -1 ? stdout_fd : STDOUT_FILENO; - l->buf = buf; - l->buflen = buflen; - l->prompt = prompt; - l->plen = strlen(prompt); - l->oldpos = l->pos = 0; - l->len = 0; - - /* Enter raw mode. */ - if (enableRawMode(l->ifd) == -1) return -1; - - l->cols = getColumns(stdin_fd, stdout_fd); - l->oldrows = 0; - l->history_index = 0; - - /* Buffer starts empty. */ - l->buf[0] = '\0'; - l->buflen--; /* Make sure there is always space for the nulterm */ - - /* If stdin is not a tty, stop here with the initialization. We - * will actually just read a line from standard input in blocking - * mode later, in linenoiseEditFeed(). */ - if (!isatty(l->ifd)) return 0; - - /* The latest history entry is always our current buffer, that - * initially is just an empty string. */ - linenoiseHistoryAdd(""); - - if (write(l->ofd,prompt,l->plen) == -1) return -1; - return 0; -} - -const char* linenoiseEditMore = "If you see this, you are misusing the API: when linenoiseEditFeed() is called, if it returns linenoiseEditMore the user is yet editing the line. See the README file for more information."; - -/* This function is part of the multiplexed API of linenoise, see the top - * comment on linenoiseEditStart() for more information. Call this function - * each time there is some data to read from the standard input file - * descriptor. In the case of blocking operations, this function can just be - * called in a loop, and block. - * - * The function returns linenoiseEditMore to signal that line editing is still - * in progress, that is, the user didn't yet pressed enter / CTRL-D. Otherwise - * the function returns the pointer to the heap-allocated buffer with the - * edited line, that the user should free with linenoiseFree(). - * - * On special conditions, NULL is returned and errno is populated: - * - * EAGAIN if the user pressed Ctrl-C - * ENOENT if the user pressed Ctrl-D - * - * Some other errno: I/O error. - */ -const char *linenoiseEditFeed(struct linenoiseState *l) { - /* Not a TTY, pass control to line reading without character - * count limits. 
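The comment block above describes the multiplexed (non-blocking) flow: linenoiseEditStart(), repeated linenoiseEditFeed() calls, then linenoiseEditStop(). The following sketch compresses that flow into a blocking program, much like linenoiseBlockingEdit() further down in this file; the buffer size and prompt string are arbitrary.

#include "linenoise.cpp/linenoise.h"

#include <cstdio>

int main() {
    char buf[1024];
    struct linenoiseState ls;

    // 1. Populate the state, switch the terminal to raw mode, print the prompt.
    if (linenoiseEditStart(&ls, -1, -1, buf, sizeof(buf), "hello> ") == -1) {
        return 1;
    }

    // 2. Feed input until a full line is available. An event-driven program
    //    would call linenoiseEditFeed() only when ls.ifd becomes readable.
    const char * line = linenoiseEditFeed(&ls);
    while (line == linenoiseEditMore) {
        line = linenoiseEditFeed(&ls);
    }

    // 3. Restore the terminal before printing anything else.
    linenoiseEditStop(&ls);

    if (line != NULL) {
        printf("you typed: %s\n", line);
        linenoiseFree(const_cast<char *>(line));
    }
    return 0;   // NULL from linenoiseEditFeed() means Ctrl-C (EAGAIN) or Ctrl-D (ENOENT)
}

Between two linenoiseEditFeed() calls, linenoiseHide() and linenoiseShow() can bracket any asynchronous output so it is not mixed into the line currently being edited.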
*/ - if (!isatty(l->ifd)) return linenoiseNoTTY(); - - char c; - int nread; - char seq[3]; - - nread = read(l->ifd,&c,1); - if (nread <= 0) return NULL; - - /* Only autocomplete when the callback is set. It returns < 0 when - * there was an error reading from fd. Otherwise it will return the - * character that should be handled next. */ - if ((l->in_completion || c == 9) && completionCallback != NULL) { - c = completeLine(l,c); - /* Read next character when 0 */ - if (c == 0) return linenoiseEditMore; - } - - switch(c) { - case ENTER: /* enter */ - history_len--; - free(history[history_len]); - if (mlmode) linenoiseEditMoveEnd(l); - if (hintsCallback) { - /* Force a refresh without hints to leave the previous - * line as the user typed it after a newline. */ - linenoiseHintsCallback *hc = hintsCallback; - hintsCallback = NULL; - refreshLine(l); - hintsCallback = hc; - } - return strdup(l->buf); - case CTRL_C: /* ctrl-c */ - errno = EAGAIN; - return NULL; - case BACKSPACE: /* backspace */ - case 8: /* ctrl-h */ - linenoiseEditBackspace(l); - break; - case CTRL_D: /* ctrl-d, remove char at right of cursor, or if the - line is empty, act as end-of-file. */ - if (l->len > 0) { - linenoiseEditDelete(l); - } else { - history_len--; - free(history[history_len]); - errno = ENOENT; - return NULL; - } - break; - case CTRL_T: /* ctrl-t, swaps current character with previous. */ - if (l->pos > 0 && l->pos < l->len) { - int aux = l->buf[l->pos-1]; - l->buf[l->pos-1] = l->buf[l->pos]; - l->buf[l->pos] = aux; - if (l->pos != l->len-1) l->pos++; - refreshLine(l); - } - break; - case CTRL_B: /* ctrl-b */ - linenoiseEditMoveLeft(l); - break; - case CTRL_F: /* ctrl-f */ - linenoiseEditMoveRight(l); - break; - case CTRL_P: /* ctrl-p */ - linenoiseEditHistoryNext(l, LINENOISE_HISTORY_PREV); - break; - case CTRL_N: /* ctrl-n */ - linenoiseEditHistoryNext(l, LINENOISE_HISTORY_NEXT); - break; - case ESC: /* escape sequence */ - /* Read the next two bytes representing the escape sequence. - * Use two calls to handle slow terminals returning the two - * chars at different times. */ - if (read(l->ifd,seq,1) == -1) break; - if (read(l->ifd,seq+1,1) == -1) break; - - /* ESC [ sequences. */ - if (seq[0] == '[') { - if (seq[1] >= '0' && seq[1] <= '9') { - /* Extended escape, read additional byte. */ - if (read(l->ifd,seq+2,1) == -1) break; - if (seq[2] == '~') { - switch(seq[1]) { - case '3': /* Delete key. */ - linenoiseEditDelete(l); - break; - } - } - } else { - switch(seq[1]) { - case 'A': /* Up */ - linenoiseEditHistoryNext(l, LINENOISE_HISTORY_PREV); - break; - case 'B': /* Down */ - linenoiseEditHistoryNext(l, LINENOISE_HISTORY_NEXT); - break; - case 'C': /* Right */ - linenoiseEditMoveRight(l); - break; - case 'D': /* Left */ - linenoiseEditMoveLeft(l); - break; - case 'H': /* Home */ - linenoiseEditMoveHome(l); - break; - case 'F': /* End*/ - linenoiseEditMoveEnd(l); - break; - } - } - } - - /* ESC O sequences. */ - else if (seq[0] == 'O') { - switch(seq[1]) { - case 'H': /* Home */ - linenoiseEditMoveHome(l); - break; - case 'F': /* End*/ - linenoiseEditMoveEnd(l); - break; - } - } - break; - default: - if (linenoiseEditInsert(l,c)) return NULL; - break; - case CTRL_U: /* Ctrl+u, delete the whole line. */ - l->buf[0] = '\0'; - l->pos = l->len = 0; - refreshLine(l); - break; - case CTRL_K: /* Ctrl+k, delete from current to end of line. 
*/ - l->buf[l->pos] = '\0'; - l->len = l->pos; - refreshLine(l); - break; - case CTRL_A: /* Ctrl+a, go to the start of the line */ - linenoiseEditMoveHome(l); - break; - case CTRL_E: /* ctrl+e, go to the end of the line */ - linenoiseEditMoveEnd(l); - break; - case CTRL_L: /* ctrl+l, clear screen */ - linenoiseClearScreen(); - refreshLine(l); - break; - case CTRL_W: /* ctrl+w, delete previous word */ - linenoiseEditDeletePrevWord(l); - break; - } - return linenoiseEditMore; -} - -/* This is part of the multiplexed linenoise API. See linenoiseEditStart() - * for more information. This function is called when linenoiseEditFeed() - * returns something different than NULL. At this point the user input - * is in the buffer, and we can restore the terminal in normal mode. */ -void linenoiseEditStop(struct linenoiseState *l) { - if (!isatty(l->ifd)) return; - disableRawMode(l->ifd); - printf("\n"); -} - -/* This just implements a blocking loop for the multiplexed API. - * In many applications that are not event-drivern, we can just call - * the blocking linenoise API, wait for the user to complete the editing - * and return the buffer. */ -static const char *linenoiseBlockingEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt) -{ - struct linenoiseState l; - - /* Editing without a buffer is invalid. */ - if (buflen == 0) { - errno = EINVAL; - return NULL; - } - - linenoiseEditStart(&l,stdin_fd,stdout_fd,buf,buflen,prompt); - const char *res; - while((res = linenoiseEditFeed(&l)) == linenoiseEditMore); - linenoiseEditStop(&l); - return res; -} - -/* This special mode is used by linenoise in order to print scan codes - * on screen for debugging / development purposes. It is implemented - * by the linenoise_example program using the --keycodes option. */ -void linenoisePrintKeyCodes(void) { - char quit[4]; - - printf("Linenoise key codes debugging mode.\n" - "Press keys to see scan codes. Type 'quit' at any time to exit.\n"); - if (enableRawMode(STDIN_FILENO) == -1) return; - memset(quit,' ',4); - while(1) { - char c; - int nread; - - nread = read(STDIN_FILENO,&c,1); - if (nread <= 0) continue; - memmove(quit,quit+1,sizeof(quit)-1); /* shift string to left. */ - quit[sizeof(quit)-1] = c; /* Insert current char on the right. */ - if (memcmp(quit,"quit",sizeof(quit)) == 0) break; - - printf("'%c' %02x (%d) (type quit to exit)\n", - isprint(c) ? c : '?', (int)c, (int)c); - printf("\r"); /* Go left edge manually, we are in raw mode. */ - fflush(stdout); - } - disableRawMode(STDIN_FILENO); -} - -/* This function is called when linenoise() is called with the standard - * input file descriptor not attached to a TTY. So for example when the - * program using linenoise is called in pipe or with a file redirected - * to its standard input. In this case, we want to be able to return the - * line regardless of its length (by default we are limited to 4k). */ -static char *linenoiseNoTTY(void) { - char *line = NULL; - size_t len = 0, maxlen = 0; - - while(1) { - if (len == maxlen) { - if (maxlen == 0) maxlen = 16; - maxlen *= 2; - char *oldval = line; - line = (char*) realloc(line,maxlen); - if (line == NULL) { - if (oldval) free(oldval); - return NULL; - } - } - int c = fgetc(stdin); - if (c == EOF || c == '\n') { - if (c == EOF && len == 0) { - free(line); - return NULL; - } else { - line[len] = '\0'; - return line; - } - } else { - line[len] = c; - len++; - } - } -} - -/* The high level function that is the main API of the linenoise library. 
- * This function checks if the terminal has basic capabilities, just checking - * for a blacklist of stupid terminals, and later either calls the line - * editing function or uses dummy fgets() so that you will be able to type - * something even in the most desperate of the conditions. */ -const char *linenoise(const char *prompt) { - char buf[LINENOISE_MAX_LINE]; - - if (!isatty(STDIN_FILENO)) { - /* Not a tty: read from file / pipe. In this mode we don't want any - * limit to the line size, so we call a function to handle that. */ - return linenoiseNoTTY(); - } else if (isUnsupportedTerm()) { - size_t len; - - printf("%s",prompt); - fflush(stdout); - if (fgets(buf,LINENOISE_MAX_LINE,stdin) == NULL) return NULL; - len = strlen(buf); - while(len && (buf[len-1] == '\n' || buf[len-1] == '\r')) { - len--; - buf[len] = '\0'; - } - return strdup(buf); - } else { - const char *retval = linenoiseBlockingEdit(STDIN_FILENO,STDOUT_FILENO,buf,LINENOISE_MAX_LINE,prompt); - return retval; - } -} - -/* This is just a wrapper the user may want to call in order to make sure - * the linenoise returned buffer is freed with the same allocator it was - * created with. Useful when the main program is using an alternative - * allocator. */ -void linenoiseFree(void *ptr) { - if (ptr == linenoiseEditMore) return; // Protect from API misuse. - free(ptr); -} - -/* ================================ History ================================= */ - -/* Free the history, but does not reset it. Only used when we have to - * exit() to avoid memory leaks are reported by valgrind & co. */ -static void freeHistory(void) { - if (history) { - int j; - - for (j = 0; j < history_len; j++) - free(history[j]); - free(history); - } -} - -/* At exit we'll try to fix the terminal to the initial conditions. */ -static void linenoiseAtExit(void) { - disableRawMode(STDIN_FILENO); - freeHistory(); -} - -/* This is the API call to add a new entry in the linenoise history. - * It uses a fixed array of char pointers that are shifted (memmoved) - * when the history max length is reached in order to remove the older - * entry and make room for the new one, so it is not exactly suitable for huge - * histories, but will work well for a few hundred of entries. - * - * Using a circular buffer is smarter, but a bit more complex to handle. */ -int linenoiseHistoryAdd(const char *line) { - char *linecopy; - - if (history_max_len == 0) return 0; - - /* Initialization on first call. */ - if (history == NULL) { - history = (char**) malloc(sizeof(char*)*history_max_len); - if (history == NULL) return 0; - memset(history,0,(sizeof(char*)*history_max_len)); - } - - /* Don't add duplicated lines. */ - if (history_len && !strcmp(history[history_len-1], line)) return 0; - - /* Add an heap allocated copy of the line in the history. - * If we reached the max length, remove the older line. */ - linecopy = strdup(line); - if (!linecopy) return 0; - if (history_len == history_max_len) { - free(history[0]); - memmove(history,history+1,sizeof(char*)*(history_max_len-1)); - history_len--; - } - history[history_len] = linecopy; - history_len++; - return 1; -} - -/* Set the maximum length for the history. This function can be called even - * if there is already some history, the function will make sure to retain - * just the latest 'len' elements if the new history length value is smaller - * than the amount of items already inside the history. 
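For most programs the blocking linenoise() call above is all that is needed, combined with the history API documented in the surrounding comments. A minimal usage sketch follows; the prompt text and the history file name are placeholders.

#include "linenoise.cpp/linenoise.h"

#include <cstdio>

int main() {
    linenoiseHistorySetMaxLen(100);        // keep only the most recent 100 entries
    linenoiseHistoryLoad("history.txt");   // best-effort: nothing to load on first run

    const char * line;
    while ((line = linenoise("prompt> ")) != NULL) {
        if (line[0] != '\0') {
            printf("echo: %s\n", line);
            linenoiseHistoryAdd(line);            // Up/Down arrows will recall it
            linenoiseHistorySave("history.txt");  // persist after every accepted line
        }
        linenoiseFree(const_cast<char *>(line));
    }
    return 0;
}

Note that linenoiseHistoryAdd() already skips a line identical to the most recent entry and drops the oldest entry once the configured maximum is reached, so the loop does not need to deduplicate or trim on its own.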
*/ -int linenoiseHistorySetMaxLen(int len) { - char **new_ptr; - - if (len < 1) return 0; - if (history) { - int tocopy = history_len; - - new_ptr = (char**) malloc(sizeof(char*)*len); - if (new_ptr == NULL) return 0; - - /* If we can't copy everything, free the elements we'll not use. */ - if (len < tocopy) { - int j; - - for (j = 0; j < tocopy-len; j++) free(history[j]); - tocopy = len; - } - memset(new_ptr,0,sizeof(char*)*len); - memcpy(new_ptr,history+(history_len-tocopy), sizeof(char*)*tocopy); - free(history); - history = new_ptr; - } - history_max_len = len; - if (history_len > history_max_len) - history_len = history_max_len; - return 1; -} - -/* Save the history in the specified file. On success 0 is returned - * otherwise -1 is returned. */ -int linenoiseHistorySave(const char *filename) { - mode_t old_umask = umask(S_IXUSR|S_IRWXG|S_IRWXO); - File file; - file.open(filename, "w"); - umask(old_umask); - if (file.file == NULL) { - return -1; - } - chmod(filename,S_IRUSR|S_IWUSR); - for (int j = 0; j < history_len; ++j) { - fprintf(file.file, "%s\n", history[j]); - } - - return 0; -} - -/* Load the history from the specified file. If the file does not exist - * zero is returned and no operation is performed. - * - * If the file exists and the operation succeeded 0 is returned, otherwise - * on error -1 is returned. */ -int linenoiseHistoryLoad(const char *filename) { - File file; - file.open(filename, "r"); - char buf[LINENOISE_MAX_LINE]; - if (file.file == NULL) { - return -1; - } - - while (fgets(buf, LINENOISE_MAX_LINE, file.file) != NULL) { - char *p; - - p = strchr(buf,'\r'); - if (!p) p = strchr(buf,'\n'); - if (p) *p = '\0'; - linenoiseHistoryAdd(buf); - } - return 0; -} -#endif diff --git a/examples/run/linenoise.cpp/linenoise.h b/examples/run/linenoise.cpp/linenoise.h deleted file mode 100644 index a14ec6c74..000000000 --- a/examples/run/linenoise.cpp/linenoise.h +++ /dev/null @@ -1,128 +0,0 @@ -/* linenoise.h -- VERSION 1.0 - * - * Guerrilla line editing library against the idea that a line editing lib - * needs to be 20,000 lines of C++ code. - * - * See linenoise.cpp for more information. - * - * ------------------------------------------------------------------------ - * - * Copyright (c) 2010-2023, Salvatore Sanfilippo - * Copyright (c) 2010-2013, Pieter Noordhuis - * Copyright (c) 2025, Eric Curtin - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __LINENOISE_H -#define __LINENOISE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include /* For size_t. */ -#include - -extern const char *linenoiseEditMore; - -/* The linenoiseState structure represents the state during line editing. - * We pass this state to functions implementing specific editing - * functionalities. */ -struct linenoiseState { - int in_completion; /* The user pressed TAB and we are now in completion - * mode, so input is handled by completeLine(). */ - size_t completion_idx; /* Index of next completion to propose. */ - int ifd; /* Terminal stdin file descriptor. */ - int ofd; /* Terminal stdout file descriptor. */ - char *buf; /* Edited line buffer. */ - size_t buflen; /* Edited line buffer size. */ - const char *prompt; /* Prompt to display. */ - size_t plen; /* Prompt length. */ - size_t pos; /* Current cursor position. */ - size_t oldpos; /* Previous refresh cursor position. */ - size_t len; /* Current edited line length. */ - size_t cols; /* Number of columns in terminal. */ - size_t oldrows; /* Rows used by last refrehsed line (multiline mode) */ - int history_index; /* The history index we are currently editing. */ -}; - -struct linenoiseCompletions { - size_t len = 0; - char ** cvec = nullptr; - bool to_free = true; - - ~linenoiseCompletions() { - if (!to_free) { - return; - } - - for (size_t i = 0; i < len; ++i) { - free(cvec[i]); - } - - free(cvec); - } -}; - -/* Non blocking API. */ -int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt); -const char *linenoiseEditFeed(struct linenoiseState *l); -void linenoiseEditStop(struct linenoiseState *l); -void linenoiseHide(struct linenoiseState *l); -void linenoiseShow(struct linenoiseState *l); - -/* Blocking API. */ -const char *linenoise(const char *prompt); -void linenoiseFree(void *ptr); - -/* Completion API. */ -typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *); -typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold); -typedef void(linenoiseFreeHintsCallback)(const char *); -void linenoiseSetCompletionCallback(linenoiseCompletionCallback *); -void linenoiseSetHintsCallback(linenoiseHintsCallback *); -void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *); -void linenoiseAddCompletion(linenoiseCompletions *, const char *); - -/* History API. */ -int linenoiseHistoryAdd(const char *line); -int linenoiseHistorySetMaxLen(int len); -int linenoiseHistorySave(const char *filename); -int linenoiseHistoryLoad(const char *filename); - -/* Other utilities. 
*/ -void linenoiseClearScreen(void); -void linenoiseSetMultiLine(int ml); -void linenoisePrintKeyCodes(void); -void linenoiseMaskModeEnable(void); -void linenoiseMaskModeDisable(void); - -#ifdef __cplusplus -} -#endif - -#endif /* __LINENOISE_H */ diff --git a/examples/run/run.cpp b/examples/run/run.cpp deleted file mode 100644 index 9362da220..000000000 --- a/examples/run/run.cpp +++ /dev/null @@ -1,1173 +0,0 @@ -#if defined(_WIN32) -# include -# include -#else -# include -# include -# include -#endif - -#if defined(LLAMA_USE_CURL) -# include -#endif - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "chat-template.hpp" -#include "common.h" -#include "json.hpp" -#include "linenoise.cpp/linenoise.h" -#include "llama-cpp.h" -#include "log.h" - -#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) -[[noreturn]] static void sigint_handler(int) { - printf("\n" LOG_COL_DEFAULT); - exit(0); // not ideal, but it's the only way to guarantee exit in all cases -} -#endif - -GGML_ATTRIBUTE_FORMAT(1, 2) -static std::string fmt(const char * fmt, ...) { - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - const int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::string buf; - buf.resize(size); - const int size2 = vsnprintf(const_cast(buf.data()), buf.size() + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - - return buf; -} - -GGML_ATTRIBUTE_FORMAT(1, 2) -static int printe(const char * fmt, ...) { - va_list args; - va_start(args, fmt); - const int ret = vfprintf(stderr, fmt, args); - va_end(args); - - return ret; -} - -static std::string strftime_fmt(const char * fmt, const std::tm & tm) { - std::ostringstream oss; - oss << std::put_time(&tm, fmt); - - return oss.str(); -} - -class Opt { - public: - int init(int argc, const char ** argv) { - ctx_params = llama_context_default_params(); - model_params = llama_model_default_params(); - context_size_default = ctx_params.n_batch; - ngl_default = model_params.n_gpu_layers; - common_params_sampling sampling; - temperature_default = sampling.temp; - - if (argc < 2) { - printe("Error: No arguments provided.\n"); - print_help(); - return 1; - } - - // Parse arguments - if (parse(argc, argv)) { - printe("Error: Failed to parse arguments.\n"); - print_help(); - return 1; - } - - // If help is requested, show help and exit - if (help) { - print_help(); - return 2; - } - - ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default; - ctx_params.n_ctx = ctx_params.n_batch; - model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default; - temperature = temperature >= 0 ? 
temperature : temperature_default; - - return 0; // Success - } - - llama_context_params ctx_params; - llama_model_params model_params; - std::string model_; - std::string user; - bool use_jinja = false; - int context_size = -1, ngl = -1; - float temperature = -1; - bool verbose = false; - - private: - int context_size_default = -1, ngl_default = -1; - float temperature_default = -1; - bool help = false; - - bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) { - return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0; - } - - int handle_option_with_value(int argc, const char ** argv, int & i, int & option_value) { - if (i + 1 >= argc) { - return 1; - } - - option_value = std::atoi(argv[++i]); - - return 0; - } - - int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) { - if (i + 1 >= argc) { - return 1; - } - - option_value = std::atof(argv[++i]); - - return 0; - } - - int parse(int argc, const char ** argv) { - bool options_parsing = true; - for (int i = 1, positional_args_i = 0; i < argc; ++i) { - if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) { - if (handle_option_with_value(argc, argv, i, context_size) == 1) { - return 1; - } - } else if (options_parsing && - (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) { - if (handle_option_with_value(argc, argv, i, ngl) == 1) { - return 1; - } - } else if (options_parsing && strcmp(argv[i], "--temp") == 0) { - if (handle_option_with_value(argc, argv, i, temperature) == 1) { - return 1; - } - } else if (options_parsing && - (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) { - verbose = true; - } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) { - use_jinja = true; - } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) { - help = true; - return 0; - } else if (options_parsing && strcmp(argv[i], "--") == 0) { - options_parsing = false; - } else if (positional_args_i == 0) { - if (!argv[i][0] || argv[i][0] == '-') { - return 1; - } - - ++positional_args_i; - model_ = argv[i]; - } else if (positional_args_i == 1) { - ++positional_args_i; - user = argv[i]; - } else { - user += " " + std::string(argv[i]); - } - } - - if (model_.empty()){ - return 1; - } - - return 0; - } - - void print_help() const { - printf( - "Description:\n" - " Runs a llm\n" - "\n" - "Usage:\n" - " llama-run [options] model [prompt]\n" - "\n" - "Options:\n" - " -c, --context-size \n" - " Context size (default: %d)\n" - " -n, -ngl, --ngl \n" - " Number of GPU layers (default: %d)\n" - " --temp \n" - " Temperature (default: %.1f)\n" - " -v, --verbose, --log-verbose\n" - " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n" - " -h, --help\n" - " Show help message\n" - "\n" - "Commands:\n" - " model\n" - " Model is a string with an optional prefix of \n" - " huggingface:// (hf://), ollama://, https:// or file://.\n" - " If no protocol is specified and a file exists in the specified\n" - " path, file:// is assumed, otherwise if a file does not exist in\n" - " the specified path, ollama:// is assumed. 
Models that are being\n" - " pulled are downloaded with .partial extension while being\n" - " downloaded and then renamed as the file without the .partial\n" - " extension when complete.\n" - "\n" - "Examples:\n" - " llama-run llama3\n" - " llama-run ollama://granite-code\n" - " llama-run ollama://smollm:135m\n" - " llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n" - " llama-run " - "huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n" - " llama-run https://example.com/some-file1.gguf\n" - " llama-run some-file2.gguf\n" - " llama-run file://some-file3.gguf\n" - " llama-run --ngl 999 some-file4.gguf\n" - " llama-run --ngl 999 some-file5.gguf Hello World\n", - context_size_default, ngl_default, temperature_default); - } -}; - -struct progress_data { - size_t file_size = 0; - std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now(); - bool printed = false; -}; - -static int get_terminal_width() { -#if defined(_WIN32) - CONSOLE_SCREEN_BUFFER_INFO csbi; - GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi); - return csbi.srWindow.Right - csbi.srWindow.Left + 1; -#else - struct winsize w; - ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); - return w.ws_col; -#endif -} - -#ifdef LLAMA_USE_CURL -class File { - public: - FILE * file = nullptr; - - FILE * open(const std::string & filename, const char * mode) { - file = fopen(filename.c_str(), mode); - - return file; - } - - int lock() { - if (file) { -# ifdef _WIN32 - fd = _fileno(file); - hFile = (HANDLE) _get_osfhandle(fd); - if (hFile == INVALID_HANDLE_VALUE) { - fd = -1; - - return 1; - } - - OVERLAPPED overlapped = {}; - if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD, - &overlapped)) { - fd = -1; - - return 1; - } -# else - fd = fileno(file); - if (flock(fd, LOCK_EX | LOCK_NB) != 0) { - fd = -1; - - return 1; - } -# endif - } - - return 0; - } - - ~File() { - if (fd >= 0) { -# ifdef _WIN32 - if (hFile != INVALID_HANDLE_VALUE) { - OVERLAPPED overlapped = {}; - UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped); - } -# else - flock(fd, LOCK_UN); -# endif - } - - if (file) { - fclose(file); - } - } - - private: - int fd = -1; -# ifdef _WIN32 - HANDLE hFile = nullptr; -# endif -}; - -class HttpClient { - public: - int init(const std::string & url, const std::vector & headers, const std::string & output_file, - const bool progress, std::string * response_str = nullptr) { - if (std::filesystem::exists(output_file)) { - return 0; - } - - std::string output_file_partial; - curl = curl_easy_init(); - if (!curl) { - return 1; - } - - progress_data data; - File out; - if (!output_file.empty()) { - output_file_partial = output_file + ".partial"; - if (!out.open(output_file_partial, "ab")) { - printe("Failed to open file for writing\n"); - - return 1; - } - - if (out.lock()) { - printe("Failed to exclusively lock file\n"); - - return 1; - } - } - - set_write_options(response_str, out); - data.file_size = set_resume_point(output_file_partial); - set_progress_options(progress, data); - set_headers(headers); - CURLcode res = perform(url); - if (res != CURLE_OK){ - printe("Fetching resource '%s' failed: %s\n", url.c_str(), curl_easy_strerror(res)); - return 1; - } - if (!output_file.empty()) { - std::filesystem::rename(output_file_partial, output_file); - } - - return 0; - } - - ~HttpClient() { - if (chunk) { - curl_slist_free_all(chunk); - } - - if (curl) { - curl_easy_cleanup(curl); - } - } - - private: - CURL * 
curl = nullptr; - struct curl_slist * chunk = nullptr; - - void set_write_options(std::string * response_str, const File & out) { - if (response_str) { - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str); - } else { - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.file); - } - } - - size_t set_resume_point(const std::string & output_file) { - size_t file_size = 0; - if (std::filesystem::exists(output_file)) { - file_size = std::filesystem::file_size(output_file); - curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast(file_size)); - } - - return file_size; - } - - void set_progress_options(bool progress, progress_data & data) { - if (progress) { - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); - curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data); - curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, update_progress); - } - } - - void set_headers(const std::vector & headers) { - if (!headers.empty()) { - if (chunk) { - curl_slist_free_all(chunk); - chunk = 0; - } - - for (const auto & header : headers) { - chunk = curl_slist_append(chunk, header.c_str()); - } - - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk); - } - } - - CURLcode perform(const std::string & url) { - curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https"); - curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L); - return curl_easy_perform(curl); - } - - static std::string human_readable_time(double seconds) { - int hrs = static_cast(seconds) / 3600; - int mins = (static_cast(seconds) % 3600) / 60; - int secs = static_cast(seconds) % 60; - - if (hrs > 0) { - return fmt("%dh %02dm %02ds", hrs, mins, secs); - } else if (mins > 0) { - return fmt("%dm %02ds", mins, secs); - } else { - return fmt("%ds", secs); - } - } - - static std::string human_readable_size(curl_off_t size) { - static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" }; - char length = sizeof(suffix) / sizeof(suffix[0]); - int i = 0; - double dbl_size = size; - if (size > 1024) { - for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) { - dbl_size = size / 1024.0; - } - } - - return fmt("%.2f %s", dbl_size, suffix[i]); - } - - static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t, - curl_off_t) { - progress_data * data = static_cast(ptr); - if (total_to_download <= 0) { - return 0; - } - - total_to_download += data->file_size; - const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size; - const curl_off_t percentage = calculate_percentage(now_downloaded_plus_file_size, total_to_download); - std::string progress_prefix = generate_progress_prefix(percentage); - - const double speed = calculate_speed(now_downloaded, data->start_time); - const double tim = (total_to_download - now_downloaded) / speed; - std::string progress_suffix = - generate_progress_suffix(now_downloaded_plus_file_size, total_to_download, speed, tim); - - int progress_bar_width = calculate_progress_bar_width(progress_prefix, progress_suffix); - std::string progress_bar; - generate_progress_bar(progress_bar_width, percentage, progress_bar); - - print_progress(progress_prefix, progress_bar, progress_suffix); - data->printed = true; - - return 0; - } - - static curl_off_t calculate_percentage(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download) { - 
return (now_downloaded_plus_file_size * 100) / total_to_download; - } - - static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast(percentage)); } - - static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) { - const auto now = std::chrono::steady_clock::now(); - const std::chrono::duration elapsed_seconds = now - start_time; - return now_downloaded / elapsed_seconds.count(); - } - - static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download, - double speed, double estimated_time) { - const int width = 10; - return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width, - human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width, - human_readable_time(estimated_time).c_str()); - } - - static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) { - int progress_bar_width = get_terminal_width() - progress_prefix.size() - progress_suffix.size() - 3; - if (progress_bar_width < 1) { - progress_bar_width = 1; - } - - return progress_bar_width; - } - - static std::string generate_progress_bar(int progress_bar_width, curl_off_t percentage, - std::string & progress_bar) { - const curl_off_t pos = (percentage * progress_bar_width) / 100; - for (int i = 0; i < progress_bar_width; ++i) { - progress_bar.append((i < pos) ? "█" : " "); - } - - return progress_bar; - } - - static void print_progress(const std::string & progress_prefix, const std::string & progress_bar, - const std::string & progress_suffix) { - printe("\r" LOG_CLR_TO_EOL "%s%s| %s", progress_prefix.c_str(), progress_bar.c_str(), progress_suffix.c_str()); - } - // Function to write data to a file - static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) { - FILE * out = static_cast(stream); - return fwrite(ptr, size, nmemb, out); - } - - // Function to capture data into a string - static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) { - std::string * str = static_cast(stream); - str->append(static_cast(ptr), size * nmemb); - return size * nmemb; - } -}; -#endif - -class LlamaData { - public: - llama_model_ptr model; - llama_sampler_ptr sampler; - llama_context_ptr context; - std::vector messages; - std::list msg_strs; - std::vector fmtted; - - int init(Opt & opt) { - model = initialize_model(opt); - if (!model) { - return 1; - } - - context = initialize_context(model, opt); - if (!context) { - return 1; - } - - sampler = initialize_sampler(opt); - - return 0; - } - - private: -#ifdef LLAMA_USE_CURL - int download(const std::string & url, const std::string & output_file, const bool progress, - const std::vector & headers = {}, std::string * response_str = nullptr) { - HttpClient http; - if (http.init(url, headers, output_file, progress, response_str)) { - return 1; - } - - return 0; - } -#else - int download(const std::string &, const std::string &, const bool, const std::vector & = {}, - std::string * = nullptr) { - printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); - - return 1; - } -#endif - - // Helper function to handle model tag extraction and URL construction - std::pair extract_model_and_tag(std::string & model, const std::string & base_url) { - std::string model_tag = "latest"; - const size_t colon_pos = model.find(':'); - if (colon_pos != 
std::string::npos) { - model_tag = model.substr(colon_pos + 1); - model = model.substr(0, colon_pos); - } - - std::string url = base_url + model + "/manifests/" + model_tag; - - return { model, url }; - } - - // Helper function to download and parse the manifest - int download_and_parse_manifest(const std::string & url, const std::vector & headers, - nlohmann::json & manifest) { - std::string manifest_str; - int ret = download(url, "", false, headers, &manifest_str); - if (ret) { - return ret; - } - - manifest = nlohmann::json::parse(manifest_str); - - return 0; - } - - int huggingface_dl(std::string & model, const std::string & bn) { - // Find the second occurrence of '/' after protocol string - size_t pos = model.find('/'); - pos = model.find('/', pos + 1); - std::string hfr, hff; - std::vector headers = { "User-Agent: llama-cpp", "Accept: application/json" }; - std::string url; - - if (pos == std::string::npos) { - auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/"); - hfr = model_name; - - nlohmann::json manifest; - int ret = download_and_parse_manifest(manifest_url, headers, manifest); - if (ret) { - return ret; - } - - hff = manifest["ggufFile"]["rfilename"]; - } else { - hfr = model.substr(0, pos); - hff = model.substr(pos + 1); - } - - url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff; - - return download(url, bn, true, headers); - } - - int ollama_dl(std::string & model, const std::string & bn) { - const std::vector headers = { "Accept: application/vnd.docker.distribution.manifest.v2+json" }; - if (model.find('/') == std::string::npos) { - model = "library/" + model; - } - - auto [model_name, manifest_url] = extract_model_and_tag(model, "https://registry.ollama.ai/v2/"); - nlohmann::json manifest; - int ret = download_and_parse_manifest(manifest_url, {}, manifest); - if (ret) { - return ret; - } - - std::string layer; - for (const auto & l : manifest["layers"]) { - if (l["mediaType"] == "application/vnd.ollama.image.model") { - layer = l["digest"]; - break; - } - } - - std::string blob_url = "https://registry.ollama.ai/v2/" + model_name + "/blobs/" + layer; - - return download(blob_url, bn, true, headers); - } - - int github_dl(const std::string & model, const std::string & bn) { - std::string repository = model; - std::string branch = "main"; - const size_t at_pos = model.find('@'); - if (at_pos != std::string::npos) { - repository = model.substr(0, at_pos); - branch = model.substr(at_pos + 1); - } - - const std::vector repo_parts = string_split(repository, "/"); - if (repo_parts.size() < 3) { - printe("Invalid GitHub repository format\n"); - return 1; - } - - const std::string & org = repo_parts[0]; - const std::string & project = repo_parts[1]; - std::string url = "https://raw.githubusercontent.com/" + org + "/" + project + "/" + branch; - for (size_t i = 2; i < repo_parts.size(); ++i) { - url += "/" + repo_parts[i]; - } - - return download(url, bn, true); - } - - int s3_dl(const std::string & model, const std::string & bn) { - const size_t slash_pos = model.find('/'); - if (slash_pos == std::string::npos) { - return 1; - } - - const std::string bucket = model.substr(0, slash_pos); - const std::string key = model.substr(slash_pos + 1); - const char * access_key = std::getenv("AWS_ACCESS_KEY_ID"); - const char * secret_key = std::getenv("AWS_SECRET_ACCESS_KEY"); - if (!access_key || !secret_key) { - printe("AWS credentials not found in environment\n"); - return 1; - } - - // Generate AWS Signature Version 4 headers - // 
(Implementation requires HMAC-SHA256 and date handling) - // Get current timestamp - const time_t now = time(nullptr); - const tm tm = *gmtime(&now); - const std::string date = strftime_fmt("%Y%m%d", tm); - const std::string datetime = strftime_fmt("%Y%m%dT%H%M%SZ", tm); - const std::vector headers = { - "Authorization: AWS4-HMAC-SHA256 Credential=" + std::string(access_key) + "/" + date + - "/us-east-1/s3/aws4_request", - "x-amz-content-sha256: UNSIGNED-PAYLOAD", "x-amz-date: " + datetime - }; - - const std::string url = "https://" + bucket + ".s3.amazonaws.com/" + key; - - return download(url, bn, true, headers); - } - - std::string basename(const std::string & path) { - const size_t pos = path.find_last_of("/\\"); - if (pos == std::string::npos) { - return path; - } - - return path.substr(pos + 1); - } - - int rm_until_substring(std::string & model_, const std::string & substring) { - const std::string::size_type pos = model_.find(substring); - if (pos == std::string::npos) { - return 1; - } - - model_ = model_.substr(pos + substring.size()); // Skip past the substring - return 0; - } - - int resolve_model(std::string & model_) { - int ret = 0; - if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) { - rm_until_substring(model_, "://"); - - return ret; - } - - const std::string bn = basename(model_); - if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://") || - string_starts_with(model_, "hf.co/")) { - rm_until_substring(model_, "hf.co/"); - rm_until_substring(model_, "://"); - ret = huggingface_dl(model_, bn); - } else if ((string_starts_with(model_, "https://") || string_starts_with(model_, "http://")) && - !string_starts_with(model_, "https://ollama.com/library/")) { - ret = download(model_, bn, true); - } else if (string_starts_with(model_, "github:") || string_starts_with(model_, "github://")) { - rm_until_substring(model_, "github:"); - rm_until_substring(model_, "://"); - ret = github_dl(model_, bn); - } else if (string_starts_with(model_, "s3://")) { - rm_until_substring(model_, "://"); - ret = s3_dl(model_, bn); - } else { // ollama:// or nothing - rm_until_substring(model_, "ollama.com/library/"); - rm_until_substring(model_, "://"); - ret = ollama_dl(model_, bn); - } - - model_ = bn; - - return ret; - } - - // Initializes the model and returns a unique pointer to it - llama_model_ptr initialize_model(Opt & opt) { - ggml_backend_load_all(); - resolve_model(opt.model_); - printe("\r" LOG_CLR_TO_EOL "Loading model"); - llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params)); - if (!model) { - printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str()); - } - - printe("\r" LOG_CLR_TO_EOL); - return model; - } - - // Initializes the context with the specified parameters - llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) { - llama_context_ptr context(llama_init_from_model(model.get(), opt.ctx_params)); - if (!context) { - printe("%s: error: failed to create the llama_context\n", __func__); - } - - return context; - } - - // Initializes and configures the sampler - llama_sampler_ptr initialize_sampler(const Opt & opt) { - llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params())); - llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1)); - llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature)); - llama_sampler_chain_add(sampler.get(), 
llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); - - return sampler; - } -}; - -// Add a message to `messages` and store its content in `msg_strs` -static void add_message(const char * role, const std::string & text, LlamaData & llama_data) { - llama_data.msg_strs.push_back(std::move(text)); - llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() }); -} - -// Function to apply the chat template and resize `formatted` if needed -static int apply_chat_template(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) { - if (use_jinja) { - json messages = json::array(); - for (const auto & msg : llama_data.messages) { - messages.push_back({ - {"role", msg.role}, - {"content", msg.content}, - }); - } - try { - minja::chat_template_inputs tmpl_inputs; - tmpl_inputs.messages = messages; - tmpl_inputs.add_generation_prompt = append; - - minja::chat_template_options tmpl_opts; - tmpl_opts.use_bos_token = false; - tmpl_opts.use_eos_token = false; - - auto result = tmpl.apply(tmpl_inputs, tmpl_opts); - llama_data.fmtted.resize(result.size() + 1); - memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1); - return result.size(); - } catch (const std::exception & e) { - printe("failed to render the chat template: %s\n", e.what()); - return -1; - } - } - int result = llama_chat_apply_template( - tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append, - append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0); - if (append && result > static_cast(llama_data.fmtted.size())) { - llama_data.fmtted.resize(result); - result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(), - llama_data.messages.size(), append, llama_data.fmtted.data(), - llama_data.fmtted.size()); - } - - return result; -} - -// Function to tokenize the prompt -static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, - std::vector & prompt_tokens, const LlamaData & llama_data) { - const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0; - - const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); - prompt_tokens.resize(n_prompt_tokens); - if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, - true) < 0) { - printe("failed to tokenize the prompt\n"); - return -1; - } - - return n_prompt_tokens; -} - -// Check if we have enough space in the context to evaluate this batch -static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { - const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); - if (n_ctx_used + batch.n_tokens > n_ctx) { - printf(LOG_COL_DEFAULT "\n"); - printe("context size exceeded\n"); - return 1; - } - - return 0; -} - -// convert the token to a string -static int convert_token_to_string(const llama_vocab * vocab, const llama_token token_id, std::string & piece) { - char buf[256]; - int n = llama_token_to_piece(vocab, token_id, buf, sizeof(buf), 0, true); - if (n < 0) { - printe("failed to convert token to piece\n"); - return 1; - } - - piece = std::string(buf, n); - return 0; -} - -static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) { - printf("%s", piece.c_str()); - fflush(stdout); - response += piece; -} - -// helper function to evaluate a prompt and generate a response -static int 
generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { - const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get()); - - std::vector tokens; - if (tokenize_prompt(vocab, prompt, tokens, llama_data) < 0) { - return 1; - } - - // prepare a batch for the prompt - llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size()); - llama_token new_token_id; - while (true) { - check_context_size(llama_data.context, batch); - if (llama_decode(llama_data.context.get(), batch)) { - printe("failed to decode\n"); - return 1; - } - - // sample the next token, check is it an end of generation? - new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1); - if (llama_vocab_is_eog(vocab, new_token_id)) { - break; - } - - std::string piece; - if (convert_token_to_string(vocab, new_token_id, piece)) { - return 1; - } - - print_word_and_concatenate_to_response(piece, response); - - // prepare the next batch with the sampled token - batch = llama_batch_get_one(&new_token_id, 1); - } - - printf(LOG_COL_DEFAULT); - return 0; -} - -static int read_user_input(std::string & user_input) { - static const char * prompt_prefix = "> "; -#ifdef WIN32 - printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix); - - std::getline(std::cin, user_input); - if (std::cin.eof()) { - printf("\n"); - return 1; - } -#else - std::unique_ptr line(const_cast(linenoise(prompt_prefix)), free); - if (!line) { - return 1; - } - - user_input = line.get(); -#endif - - if (user_input == "/bye") { - return 1; - } - - if (user_input.empty()) { - return 2; - } - -#ifndef WIN32 - linenoiseHistoryAdd(line.get()); -#endif - - return 0; // Should have data in happy path -} - -// Function to generate a response based on the prompt -static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response, - const bool stdout_a_terminal) { - // Set response color - if (stdout_a_terminal) { - printf(LOG_COL_YELLOW); - } - - if (generate(llama_data, prompt, response)) { - printe("failed to generate response\n"); - return 1; - } - - // End response with color reset and newline - printf("\n%s", stdout_a_terminal ? 
LOG_COL_DEFAULT : ""); - return 0; -} - -// Helper function to apply the chat template and handle errors -static int apply_chat_template_with_error_handling(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) { - const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja); - if (new_len < 0) { - printe("failed to apply the chat template\n"); - return -1; - } - - output_length = new_len; - return 0; -} - -// Helper function to handle user input -static int handle_user_input(std::string & user_input, const std::string & user) { - if (!user.empty()) { - user_input = user; - return 0; // No need for interactive input - } - - return read_user_input(user_input); // Returns true if input ends the loop -} - -static bool is_stdin_a_terminal() { -#if defined(_WIN32) - HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); - DWORD mode; - return GetConsoleMode(hStdin, &mode); -#else - return isatty(STDIN_FILENO); -#endif -} - -static bool is_stdout_a_terminal() { -#if defined(_WIN32) - HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); - DWORD mode; - return GetConsoleMode(hStdout, &mode); -#else - return isatty(STDOUT_FILENO); -#endif -} - -// Function to handle user input -static int get_user_input(std::string & user_input, const std::string & user) { - while (true) { - const int ret = handle_user_input(user_input, user); - if (ret == 1) { - return 1; - } - - if (ret == 2) { - continue; - } - - break; - } - - return 0; -} - -// Main chat loop function -static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_jinja) { - int prev_len = 0; - llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); - auto chat_templates = common_chat_templates_from_model(llama_data.model.get(), ""); - GGML_ASSERT(chat_templates.template_default); - static const bool stdout_a_terminal = is_stdout_a_terminal(); - while (true) { - // Get user input - std::string user_input; - if (get_user_input(user_input, user) == 1) { - return 0; - } - - add_message("user", user.empty() ? user_input : user, llama_data); - int new_len; - if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) { - return 1; - } - - std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len); - std::string response; - if (generate_response(llama_data, prompt, response, stdout_a_terminal)) { - return 1; - } - - if (!user.empty()) { - break; - } - - add_message("assistant", response, llama_data); - if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) { - return 1; - } - } - - return 0; -} - -static void log_callback(const enum ggml_log_level level, const char * text, void * p) { - const Opt * opt = static_cast(p); - if (opt->verbose || level == GGML_LOG_LEVEL_ERROR) { - printe("%s", text); - } -} - -static std::string read_pipe_data() { - std::ostringstream result; - result << std::cin.rdbuf(); // Read all data from std::cin - return result.str(); -} - -static void ctrl_c_handling() { -#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = sigint_handler; - sigemptyset(&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); -#elif defined(_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? 
(sigint_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); -#endif -} - -int main(int argc, const char ** argv) { - ctrl_c_handling(); - Opt opt; - const int ret = opt.init(argc, argv); - if (ret == 2) { - return 0; - } else if (ret) { - return 1; - } - - if (!is_stdin_a_terminal()) { - if (!opt.user.empty()) { - opt.user += "\n\n"; - } - - opt.user += read_pipe_data(); - } - - llama_log_set(log_callback, &opt); - LlamaData llama_data; - if (llama_data.init(opt)) { - return 1; - } - - if (chat_loop(llama_data, opt.user, opt.use_jinja)) { - return 1; - } - - return 0; -} diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index 0f50e50de..0fb5e359b 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-save-load-state) add_executable(${TARGET} save-load-state.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index cf7cbd815..d8afdc141 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,17 +1,17 @@ -#include "arg.h" #include "common.h" #include "llama.h" #include #include +#include int main(int argc, char ** argv) { - common_params params; + gpt_params params; params.prompt = "The quick brown fox"; - params.sampling.seed = 1234; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } @@ -28,35 +28,21 @@ int main(int argc, char ** argv) { std::string result2; // init - common_init_result llama_init = common_init_from_params(params); - - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); return 1; } - auto sparams = llama_sampler_chain_default_params(); - - llama_sampler * smpl = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); - // tokenize prompt - auto tokens = common_tokenize(ctx, params.prompt, true); - - // prepare the batch - llama_batch batch = llama_batch_init(tokens.size(), 0, 1); - for (size_t i = 0; i < tokens.size(); i++) { - common_batch_add(batch, tokens[i], i, {0}, false); - } - batch.logits[batch.n_tokens - 1] = true; // generate next token + auto tokens = llama_tokenize(ctx, params.prompt, true); // evaluate prompt - llama_decode(ctx, batch); - n_past += batch.n_tokens; + llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0)); + n_past += tokens.size(); // save state (rng, logits, embedding and kv_cache) to file { @@ -77,18 +63,25 @@ int main(int argc, char ** argv) { printf("\nfirst run: %s", params.prompt.c_str()); for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl, ctx, -1); - auto next_token_str = common_token_to_piece(ctx, next_token); + auto * logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(model); + + std::vector candidates; + 
candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + auto next_token = llama_sample_token(ctx, &candidates_p); + auto next_token_str = llama_token_to_piece(ctx, next_token); printf("%s", next_token_str.c_str()); result0 += next_token_str; - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {0}, true); - - if (llama_decode(ctx, batch)) { + if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); + llama_free(ctx); + llama_free_model(model); return 1; } n_past += 1; @@ -96,12 +89,11 @@ int main(int argc, char ** argv) { printf("\n\n"); + // free old context + llama_free(ctx); + // make new context - llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params)); - - llama_sampler * smpl2 = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed)); + auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); printf("\nsecond run: %s", params.prompt.c_str()); @@ -118,6 +110,8 @@ int main(int argc, char ** argv) { if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) { fprintf(stderr, "\n%s : failed to read state\n", __func__); + llama_free(ctx2); + llama_free_model(model); return 1; } @@ -129,18 +123,24 @@ int main(int argc, char ** argv) { // second run for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl2, ctx2, -1); - auto next_token_str = common_token_to_piece(ctx2, next_token); + auto * logits = llama_get_logits(ctx2); + auto n_vocab = llama_n_vocab(model); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + auto next_token = llama_sample_token(ctx2, &candidates_p); + auto next_token_str = llama_token_to_piece(ctx2, next_token); printf("%s", next_token_str.c_str()); result1 += next_token_str; - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {0}, true); - - if (llama_decode(ctx2, batch)) { + if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); + llama_free(ctx2); + llama_free_model(model); return 1; } n_past += 1; @@ -148,17 +148,15 @@ int main(int argc, char ** argv) { printf("\n\n"); + llama_free(ctx2); + if (result0 != result1) { fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__); return 1; } // make new context - llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); - - llama_sampler * smpl3 = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); + auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); printf("\nsingle seq run: %s", params.prompt.c_str()); @@ -175,6 +173,8 @@ int main(int argc, char ** argv) { if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) { fprintf(stderr, "\n%s : failed to read 
state\n", __func__); + llama_free(ctx3); + llama_free_model(model); return 1; } @@ -191,6 +191,8 @@ int main(int argc, char ** argv) { const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0); if (ncopy != seq_store.size()) { fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); + llama_free(ctx3); + llama_free_model(model); return 1; } fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); @@ -203,6 +205,8 @@ int main(int argc, char ** argv) { const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1); if (nset != seq_store.size()) { fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); + llama_free(ctx3); + llama_free_model(model); return 1; } fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); @@ -210,18 +214,24 @@ int main(int argc, char ** argv) { // third run with seq 1 instead of 0 for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl3, ctx3, -1); - auto next_token_str = common_token_to_piece(ctx3, next_token); + auto * logits = llama_get_logits(ctx3); + auto n_vocab = llama_n_vocab(model); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + auto next_token = llama_sample_token(ctx3, &candidates_p); + auto next_token_str = llama_token_to_piece(ctx3, next_token); printf("%s", next_token_str.c_str()); result2 += next_token_str; - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {1}, true); - - if (llama_decode(ctx3, batch)) { + if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); + llama_free(ctx3); + llama_free_model(model); return 1; } n_past += 1; @@ -229,11 +239,8 @@ int main(int argc, char ** argv) { printf("\n"); - llama_sampler_free(smpl); - llama_sampler_free(smpl2); - llama_sampler_free(smpl3); - - llama_batch_free(batch); + llama_free(ctx3); + llama_free_model(model); if (result0 != result2) { fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 1b7cc8c13..dbe41f1fd 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET llama-server) - -option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) +option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) +option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) @@ -15,8 +15,21 @@ set(TARGET_SRCS httplib.h ) set(PUBLIC_ASSETS - index.html.gz - loading.html + colorthemes.css + style.css + theme-beeninorder.css + theme-ketivah.css + theme-mangotango.css + theme-playground.css + theme-polarnight.css + theme-snowstorm.css + index.html + index-new.html + index.js + completion.js + system-prompts.js + prompt-formats.js + json-schema-to-grammar.mjs ) foreach(asset ${PUBLIC_ASSETS}) @@ -28,13 +41,14 @@ foreach(asset ${PUBLIC_ASSETS}) OUTPUT "${output}" COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P 
"${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" ) - set_source_files_properties(${output} PROPERTIES GENERATED TRUE) endforeach() add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) +target_compile_definitions(${TARGET} PRIVATE + SERVER_VERBOSE=$ +) -target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) @@ -47,4 +61,4 @@ if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() -target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/server/README.md b/examples/server/README.md index d0b262f0e..ff4074517 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -5,9 +5,8 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. **Features:** - * LLM inference of F16 and quantized models on GPU and CPU + * LLM inference of F16 and quantum models on GPU and CPU * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes - * Reranking endoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510) * Parallel decoding with multi-user support * Continuous batching * Multimodal (wip) @@ -18,183 +17,290 @@ The project is under active development, and we are [looking for feedback and co ## Usage - - -**Common params** - -| Argument | Explanation | -| -------- | ----------- | -| `-h, --help, --usage` | print usage and exit | -| `--version` | show version and build info | -| `--verbose-prompt` | print a verbose prompt before generation (default: false) | -| `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | -| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | -| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | -| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | -| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)
| -| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
| -| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)
| -| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | -| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | -| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | -| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
| -| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | -| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | -| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
(env: LLAMA_ARG_N_PREDICT) | -| `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | -| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | -| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | -| `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | -| `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | -| `-e, --escape` | process escape sequences (\n, \r, \t, \', \", \\) (default: true) | -| `--no-escape` | do not process escape sequences | -| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | -| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N
(env: LLAMA_ARG_ROPE_SCALE) | -| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | -| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | -| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | -| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | -| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | -| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | -| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | -| `-dkvc, --dump-kv-cache` | verbose print of the KV cache | -| `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | -| `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | -| `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | -| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | -| `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | -| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_NO_MMAP) | -| `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | -| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | -| `--list-devices` | print list of available devices and exit | -| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | -| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
(env: LLAMA_ARG_SPLIT_MODE) | -| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
(env: LLAMA_ARG_TENSOR_SPLIT) | -| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | -| `--check-tensors` | check model tensor data for invalid values (default: false) | -| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | -| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | -| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | -| `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | -| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | -| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | -| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | -| `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | -| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)
(env: LLAMA_ARG_HF_REPO) | -| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)
(env: LLAMA_ARG_HF_FILE) | -| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | -| `--log-disable` | Log disable | -| `--log-file FNAME` | Log to file | -| `--log-colors` | Enable colored logging
(env: LLAMA_LOG_COLORS) | -| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) | -| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.
(env: LLAMA_LOG_VERBOSITY) | -| `--log-prefix` | Enable prefix in log messages
(env: LLAMA_LOG_PREFIX) | -| `--log-timestamps` | Enable timestamps in log messages
(env: LLAMA_LOG_TIMESTAMPS) | - - -**Sampling params** - -| Argument | Explanation | -| -------- | ----------- | -| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: dry;top_k;typ_p;top_p;min_p;xtc;temperature) | -| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | -| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) | -| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.8) | -| `--top-k N` | top-k sampling (default: 40, 0 = disabled) | -| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | -| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | -| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) | -| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) | -| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | -| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | -| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | -| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | -| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) | -| `--dry-base N` | set DRY sampling base value (default: 1.75) | -| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | -| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | -| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers
| -| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | -| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | -| `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | -| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) | -| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) | -| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | -| `--grammar-file FNAME` | file to read grammar from | -| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | -| `--jinja` | Enable experimental Jinja templating engine (required for tool use) | - -**Example-specific params** - -| Argument | Explanation | -| -------- | ----------- | -| `--no-context-shift` | disables context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | -| `-sp, --special` | special tokens output enabled (default: false) | -| `--no-warmup` | skip warming up the model with an empty run | -| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | -| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) | -| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | -| `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | -| `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_ALIAS) | -| `--host HOST` | ip address to listen (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | -| `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | -| `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | -| `--no-webui` | Disable the Web UI (default: enabled)
(env: LLAMA_ARG_NO_WEBUI) | -| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | -| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | -| `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | -| `--api-key-file FNAME` | path to file containing API keys (default: none) | -| `--ssl-key-file FNAME` | path to a PEM-encoded SSL private key file
(env: LLAMA_ARG_SSL_KEY_FILE) | -| `--ssl-cert-file FNAME` | path to a PEM-encoded SSL certificate file
(env: LLAMA_ARG_SSL_CERT_FILE) | -| `-to, --timeout N` | server read/write timeout in seconds (default: 600)
(env: LLAMA_ARG_TIMEOUT) | -| `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | -| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
(env: LLAMA_ARG_CACHE_REUSE) | -| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | -| `--slots` | enable slots monitoring endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | -| `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | -| `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | -| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
list of built-in templates:
chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| -| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | -| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | -| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)
(env: LLAMA_ARG_DRAFT_MIN) | -| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)
(env: LLAMA_ARG_DRAFT_P_MIN) | -| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE_DRAFT) | -| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | -| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | -| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | - - -Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. - -Example usage of docker compose with environment variables: - -```yml -services: - llamacpp-server: - image: ghcr.io/ggerganov/llama.cpp:server - ports: - - 8080:8080 - volumes: - - ./models:/models - environment: - # alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model - LLAMA_ARG_MODEL: /models/my_model.gguf - LLAMA_ARG_CTX_SIZE: 4096 - LLAMA_ARG_N_PARALLEL: 2 - LLAMA_ARG_ENDPOINT_METRICS: 1 - LLAMA_ARG_PORT: 8080 ``` +usage: ./llama-server [options] + +general: + + -h, --help, --usage print usage and exit + --version show version and build info + -v, --verbose print verbose information + --verbosity N set specific verbosity level (default: 0) + --verbose-prompt print a verbose prompt before generation (default: false) + --no-display-prompt don't print prompt at generation (default: false) + -co, --color colorise output to distinguish prompt and user input from generations (default: false) + -s, --seed SEED RNG seed (default: -1, use random seed for < 0) + -t, --threads N number of threads to use during generation (default: 8) + -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads) + -td, --threads-draft N number of threads to use during generation (default: same as --threads) + -tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft) + --draft N number of tokens to draft for speculative decoding (default: 5) + -ps, --p-split N speculative decoding split probability (default: 0.1) + -lcs, --lookup-cache-static FNAME + path to static lookup cache to use for lookup decoding (not updated by generation) + -lcd, --lookup-cache-dynamic FNAME + path to dynamic lookup cache to use for lookup decoding (updated by generation) + -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model) + -n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled) + -b, --batch-size N logical maximum batch size (default: 2048) + -ub, --ubatch-size N physical maximum batch size (default: 512) + --keep N number of tokens to keep from the initial prompt (default: 0, -1 = all) + --chunks N max number of chunks to process (default: -1, -1 = all) + -fa, --flash-attn enable Flash Attention (default: disabled) + -p, --prompt PROMPT prompt to start generation with + in conversation mode, this will be used as system prompt + (default: '') + -f, --file FNAME a file containing the prompt (default: none) + --in-file FNAME an input file (repeat to specify multiple files) + -bf, --binary-file FNAME binary file containing the prompt (default: none) + -e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true) + --no-escape do not process escape sequences + -ptc, --print-token-count N print token count every N tokens (default: -1) + --prompt-cache FNAME file to cache prompt state for faster startup (default: none) + --prompt-cache-all if specified, saves user input and generations to cache as well + not supported with --interactive or other interactive options + --prompt-cache-ro if specified, uses the prompt cache but does not update it + -r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode + can be specified more than once for multiple prompts + -sp, --special special tokens output enabled 
(default: false) + -cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix + if suffix/prefix are not specified, default chat template will be used + (default: false) + -i, --interactive run in interactive mode (default: false) + -if, --interactive-first run in interactive mode and wait for input right away (default: false) + -mli, --multiline-input allows you to write or paste multiple lines without ending each in '\' + --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string + --in-prefix STRING string to prefix user inputs with (default: empty) + --in-suffix STRING string to suffix after user inputs with (default: empty) + --spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) + +sampling: + + --samplers SAMPLERS samplers that will be used for generation in the order, separated by ';' + (default: top_k;tfs_z;typical_p;top_p;min_p;temperature) + --sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt) + --ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf) + --penalize-nl penalize newline tokens (default: false) + --temp N temperature (default: 0.8) + --top-k N top-k sampling (default: 40, 0 = disabled) + --top-p N top-p sampling (default: 0.9, 1.0 = disabled) + --min-p N min-p sampling (default: 0.1, 0.0 = disabled) + --tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled) + --typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) + --repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) + --repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) + --presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled) + --frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) + --dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled) + --dynatemp-exp N dynamic temperature exponent (default: 1.0) + --mirostat N use Mirostat sampling. + Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used. + (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) + --mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1) + --mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0) + -l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion, + i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello', + or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' + --cfg-negative-prompt PROMPT + negative prompt to use for guidance (default: '') + --cfg-negative-prompt-file FNAME + negative prompt file to use for guidance + --cfg-scale N strength of guidance (default: 1.0, 1.0 = disable) + --chat-template JINJA_TEMPLATE + set custom jinja chat template (default: template taken from model's metadata) + if suffix/prefix are specified, template will be disabled + only commonly used templates are accepted: + https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + +grammar: + + --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') + --grammar-file FNAME file to read grammar from + -j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object + For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead + +embedding: + + --pooling {none,mean,cls,last} + pooling type for embeddings, use model default if unspecified + --attention {causal,non-causal} + attention type for embeddings, use model default if unspecified + +context hacking: + + --rope-scaling {none,linear,yarn} + RoPE frequency scaling method, defaults to linear unless specified by the model + --rope-scale N RoPE context scaling factor, expands context by a factor of N + --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model) + --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N + --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size) + --yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) + --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0) + --yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0) + --yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0) + -gan, --grp-attn-n N group-attention factor (default: 1) + -gaw, --grp-attn-w N group-attention width (default: 512.0) + -dkvc, --dump-kv-cache verbose print of the KV cache + -nkvo, --no-kv-offload disable KV offload + -ctk, --cache-type-k TYPE KV cache data type for K (default: f16) + -ctv, --cache-type-v TYPE KV cache data type for V (default: f16) + +perplexity: + + --all-logits return logits for all tokens in the batch (default: false) + --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f + --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400) + --winogrande compute Winogrande score over random tasks from datafile supplied with -f + --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0) + --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f + --multiple-choice-tasks N + number of tasks to use when computing the multiple choice score (default: 0) + --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base + --ppl-stride N stride for perplexity calculation (default: 0) + --ppl-output-type {0,1} output type for perplexity calculation (default: 0) + +parallel: + + -dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled) + -np, --parallel N number of parallel sequences to decode (default: 1) + -ns, --sequences N number of sequences to decode (default: 1) + -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled) + +multi-modality: + + --mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md + --image FILE path to an image file. use with multimodal models. 
Specify multiple times for batching + +backend: + + --rpc SERVERS comma separated list of RPC servers + --mlock force system to keep model in RAM rather than swapping or compressing + --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock) + --numa TYPE attempt optimizations that help on some NUMA systems + - distribute: spread execution evenly over all nodes + - isolate: only spawn threads on CPUs on the node that execution started on + - numactl: use the CPU map provided by numactl + if run without this previously, it is recommended to drop the system page cache before using this + see https://github.com/ggerganov/llama.cpp/issues/1437 + +model: + + --check-tensors check model tensor data for invalid values (default: false) + --override-kv KEY=TYPE:VALUE + advanced option to override model metadata by key. may be specified multiple times. + types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false + --lora FNAME apply LoRA adapter (implies --no-mmap) + --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap) + --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter + --control-vector FNAME add a control vector + note: this argument can be repeated to add multiple control vectors + --control-vector-scaled FNAME SCALE + add a control vector with user defined scaling SCALE + note: this argument can be repeated to add multiple scaled control vectors + --control-vector-layer-range START END + layer range to apply the control vector(s) to, start and end inclusive + -m, --model FNAME model path (default: models/$filename with filename from --hf-file + or --model-url if set, otherwise models/7B/ggml-model-f16.gguf) + -md, --model-draft FNAME draft model for speculative decoding (default: unused) + -mu, --model-url MODEL_URL model download url (default: unused) + -hfr, --hf-repo REPO Hugging Face model repository (default: unused) + -hff, --hf-file FILE Hugging Face model file (default: unused) + -hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable) + +retrieval: + + --context-file FNAME file to load context from (repeat to specify multiple files) + --chunk-size N minimum length of embedded text chunks (default: 64) + --chunk-separator STRING + separator between chunks (default: ' + ') + +passkey: + + --junk N number of times to repeat the junk text (default: 250) + --pos N position of the passkey in the junk text (default: -1) + +imatrix: + + -o, --output FNAME output file (default: 'imatrix.dat') + --output-frequency N output the imatrix every N iterations (default: 10) + --save-frequency N save an imatrix copy every N iterations (default: 0) + --process-output collect data for the output tensor (default: false) + --no-ppl do not compute perplexity (default: true) + --chunk N start processing the input from chunk N (default: 0) + +bench: + + -pps is the prompt shared across parallel sequences (default: false) + -npp n0,n1,... number of prompt tokens + -ntg n0,n1,... number of text generation tokens + -npl n0,n1,... 
number of parallel prompts + +embedding: + + --embd-normalize normalisation for embendings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) + --embd-output-format empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix + --embd-separator separator of embendings (default \n) for example "<#sep#>" + +server: + + --host HOST ip address to listen (default: 127.0.0.1) + --port PORT port to listen (default: 8080) + --path PATH path to serve static files from (default: ) + --embedding(s) enable embedding endpoint (default: disabled) + --api-key KEY API key to use for authentication (default: none) + --api-key-file FNAME path to file containing API keys (default: none) + --ssl-key-file FNAME path to file a PEM-encoded SSL private key + --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate + --timeout N server read/write timeout in seconds (default: 600) + --threads-http N number of threads used to process HTTP requests (default: -1) + --system-prompt-file FNAME + set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications + --log-format {text,json} + log output format: json or text (default: json) + --metrics enable prometheus compatible metrics endpoint (default: disabled) + --no-slots disables slots monitoring endpoint (default: enabled) + --slot-save-path PATH path to save slot kv cache (default: disabled) + --chat-template JINJA_TEMPLATE + set custom jinja chat template (default: template taken from model's metadata) + only commonly used templates are accepted: + https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + -sps, --slot-prompt-similarity SIMILARITY + how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) + + +logging: + + --simple-io use basic IO for better compatibility in subprocesses and limited consoles + -ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset) + --log-test Run simple logging test + --log-disable Disable trace logs + --log-enable Enable trace logs + --log-file FNAME Specify a log filename (without extension) + --log-new Create a separate new log file on start. Each log file will have unique name: "..log" + --log-append Don't truncate the old log file. + +cvector: + + -o, --output FNAME output file (default: 'control_vector.gguf') + --positive-file FNAME positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt') + --negative-file FNAME negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt') + --pca-batch N batch size used for PCA. 
Larger batch runs faster, but uses more memory (default: 100) + --pca-iter N number of iterations used for PCA (default: 1000) + --method {pca,mean} dimensionality reduction method to be used (default: pca) +``` + ## Build `llama-server` is built alongside everything else from the root of the project +- Using `make`: + + ```bash + make llama-server + ``` + - Using `CMake`: ```bash @@ -208,6 +314,15 @@ services: `llama-server` can also be built with SSL support using OpenSSL 3 +- Using `make`: + + ```bash + # NOTE: For non-system openssl, use the following: + # CXXFLAGS="-I /path/to/openssl/include" + # LDFLAGS="-L /path/to/openssl/lib" + make LLAMA_SERVER_SSL=true llama-server + ``` + - Using `CMake`: ```bash @@ -215,41 +330,6 @@ services: cmake --build build --config Release -t llama-server ``` -## Web UI - -The project includes a web-based user interface that enables interaction with the model through the `/chat/completions` endpoint. - -The web UI is developed using: -- `react` framework for frontend development -- `tailwindcss` and `daisyui` for styling -- `vite` for build tooling - -A pre-built version is available as a single HTML file under `/public` directory. - -To build or to run the dev server (with hot reload): - -```sh -# make sure you have nodejs installed -cd examples/server/webui -npm i - -# to run the dev server -npm run dev - -# to build the public/index.html.gz -npm run build -``` -After `public/index.html.gz` has been generated we need to generate the c++ -headers (like build/examples/server/index.html.gz.hpp) that will be included -by server.cpp. This is done by building `llama-server` as described in the -[build](#build) section above. - -NOTE: if you are using the vite dev server, you can change the API base URL to llama.cpp. To do that, run this code snippet in browser's console: - -```js -localStorage.setItem('base', 'http://localhost:8080') -``` - ## Quick Start To get started right away, run the following command, making sure to use the correct path for the model you have: @@ -304,23 +384,23 @@ mkdir llama-client cd llama-client ``` -Create an index.js file and put this inside: +Create a index.js file and put this inside: ```javascript -const prompt = "Building a website can be done in 10 simple steps:" +const prompt = `Building a website can be done in 10 simple steps:`; -async function test() { +async function Test() { let response = await fetch("http://127.0.0.1:8080/completion", { - method: "POST", + method: 'POST', body: JSON.stringify({ prompt, - n_predict: 64, + n_predict: 512, }) }) console.log((await response.json()).content) } -test() +Test() ``` And run it: @@ -331,567 +411,334 @@ node index.js ## API Endpoints -### GET `/health`: Returns heath check result +- **GET** `/health`: Returns the current state of the server: + - 503 -> `{"status": "loading model"}` if the model is still being loaded. + - 500 -> `{"status": "error"}` if the model failed to load. + - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. + - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available. + - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available. 
-**Response format** + If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set. -- HTTP status code 503 - - Body: `{"error": {"code": 503, "message": "Loading model", "type": "unavailable_error"}}` - - Explanation: the model is still being loaded. -- HTTP status code 200 - - Body: `{"status": "ok" }` - - Explanation: the model is successfully loaded and the server is ready. +- **POST** `/completion`: Given a `prompt`, it returns the predicted completion. -### POST `/completion`: Given a `prompt`, it returns the predicted completion. + *Options:* -> [!IMPORTANT] -> -> This endpoint is **not** OAI-compatible. For OAI-compatible client, use `/v1/completions` instead. + `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true: -*Options:* + - The prompt is a string or an array with the first element given as a string + - The model's `tokenizer.ggml.add_bos_token` metadata is `true` + - The system prompt is empty -`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true: + `temperature`: Adjust the randomness of the generated text. Default: `0.8` - - The prompt is a string or an array with the first element given as a string - - The model's `tokenizer.ggml.add_bos_token` metadata is `true` + `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled. -These input shapes and data type are allowed for `prompt`: + `dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0` - - Single string: `"string"` - - Single sequence of tokens: `[12, 34, 56]` - - Mixed tokens and strings: `[12, 34, "string", 56, 78]` + `top_k`: Limit the next token selection to the K most probable tokens. Default: `40` -Multiple prompts are also supported. In this case, the completion result will be an array. + `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95` - - Only strings: `["string1", "string2"]` - - Strings and sequences of tokens: `["string1", [12, 34, 56]]` - - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]` + `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05` -`temperature`: Adjust the randomness of the generated text. Default: `0.8` + `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. -`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled. 
+ `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. + By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. -`dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0` + `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. -`top_k`: Limit the next token selection to the K most probable tokens. Default: `40` + `stop`: Specify a JSON array of stopping strings. + These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]` -`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95` + `tfs_z`: Enable tail free sampling with parameter z. Default: `1.0`, which is disabled. -`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05` + `typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled. -`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. + `repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1` -`n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0` + `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size. -`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. -By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. + `penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true` -`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`. + `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled. -`stop`: Specify a JSON array of stopping strings. -These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]` + `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled. -`typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled. + `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`. -`repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1` + `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0. -`repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size. 
+ `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0` -`presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled. + `mirostat_eta`: Set the Mirostat learning rate, parameter eta. Default: `0.1` -`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled. + `grammar`: Set grammar for grammar-based sampling. Default: no grammar -`dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled. + `json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features. Default: no JSON schema. -`dry_base`: Set the DRY repetition penalty base value. Default: `1.75` + `seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed. -`dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2` + `ignore_eos`: Ignore end of stream token and continue generating. Default: `false` -`dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size. + `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]` -`dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']` + `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0` -`xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled. + `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0` -`xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC) + `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. -`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0. + `id_slot`: Assign the completion task to an specific slot. 
If is -1 the task will be assigned to a Idle slot. Default: `-1` -`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0` + `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` -`mirostat_eta`: Set the Mirostat learning rate, parameter eta. Default: `0.1` + `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) -`grammar`: Set grammar for grammar-based sampling. Default: no grammar + `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. -`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features. Default: no JSON schema. +### Result JSON -`seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed. +- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. -`ignore_eos`: Ignore end of stream token and continue generating. Default: `false` +- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure: -`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]` - -`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0` - -`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0` - -`t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled. - -`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. 
You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. - -`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` - -`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true` - -`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false` - -`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. - -`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` - -`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain. - -`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name. - -`lora`: A list of LoRA adapters to be applied to this specific request. Each object in the list must contain `id` and `scale` fields. For example: `[{"id": 0, "scale": 0.5}, {"id": 1, "scale": 1.1}]`. If a LoRA adapter is not specified in the list, its scale will default to `0.0`. Please note that requests with different LoRA configurations will not be batched together, which may result in performance degradation. - -**Response format** - -- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support. - -- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements: - ``` - { - "content": "", - "tokens": [ generated token ids if requested ], +```json +{ + "content": "", + "probs": [ + { + "prob": float, + "tok_str": "" + }, + { + "prob": float, + "tok_str": "" + }, ... 
- "probs": [ - { - "id": , - "logprob": float, - "token": "", - "bytes": [int, int, ...], - "top_logprobs": [ - { - "id": , - "logprob": float, - "token": "", - "bytes": [int, int, ...], - }, - { - "id": , - "logprob": float, - "token": "", - "bytes": [int, int, ...], - }, - ... - ] - }, - { - "id": , - "logprob": float, - "token": "", - "bytes": [int, int, ...], - "top_logprobs": [ - ... - ] - }, - ... - ] - }, - ``` - Please note that if `post_sampling_probs` is set to `true`: - - `logprob` will be replaced with `prob`, with the value between 0.0 and 1.0 - - `top_logprobs` will be replaced with `top_probs`. Each element contains: - - `id`: token ID - - `token`: token in string - - `bytes`: token in bytes - - `prob`: token probability, with the value between 0.0 and 1.0 - - Number of elements in `top_probs` may be less than `n_probs` + ] +}, +``` + +Notice that each `probs` is an array of length `n_probs`. - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string. -- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request. - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.). -- `model`: The model alias (for model path, please use `/props` endpoint) -- `prompt`: The processed `prompt` (special tokens may be added) -- `stop_type`: Indicating whether the completion has stopped. Possible values are: - - `none`: Generating (not stopped) - - `eos`: Stopped because it encountered the EOS token - - `limit`: Stopped because `n_predict` tokens were generated before stop words or EOS was encountered - - `word`: Stopped due to encountering a stopping word from `stop` JSON array provided +- `model`: The path to the model loaded with `-m` +- `prompt`: The provided `prompt` +- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token +- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered +- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided - `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word) - `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second` - `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`) - `tokens_evaluated`: Number of tokens evaluated in total from the prompt - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) +- **POST** `/tokenize`: Tokenize a given text. -### POST `/tokenize`: Tokenize a given text + *Options:* -*Options:* + `content`: Set the text to tokenize. -`content`: (Required) The text to tokenize. + `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. 
Default: `false` -`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` +- **POST** `/detokenize`: Convert tokens to text. -`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false` + *Options:* -**Response:** + `tokens`: Set the tokens to detokenize. -Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise. +- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. + *Options:* -If `with_pieces` is `false`: -```json -{ - "tokens": [123, 456, 789] -} -``` + `content`: Set the text to process. -If `with_pieces` is `true`: -```json -{ - "tokens": [ - {"id": 123, "piece": "Hello"}, - {"id": 456, "piece": " world"}, - {"id": 789, "piece": "!"} - ] -} -``` + `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. -With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k -``` -{ - "tokens": [ - {"id": 198, "piece": [195]}, // hex C3 - {"id": 164, "piece": [161]} // hex A1 - ] -} -``` +- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. -### POST `/detokenize`: Convert tokens to text + *Options:* -*Options:* + `input_prefix`: Set the prefix of the code to infill. -`tokens`: Set the tokens to detokenize. + `input_suffix`: Set the suffix of the code to infill. -### POST `/apply-template`: Apply chat template to a conversation + It also accepts all the options of `/completion` except `stream` and `prompt`. -Uses the server's prompt template formatting functionality to convert chat messages to a single string expected by a chat model as input, but does not perform inference. Instead, the prompt string is returned in the `prompt` field of the JSON response. The prompt can then be modified as desired (for example, to insert "Sure!" at the beginning of the model's response) before sending to `/completion` to generate the chat response. +- **GET** `/props`: Return current server settings. -*Options:* - -`messages`: (Required) Chat turns in the same format as `/v1/chat/completions`. - -**Response format** - -Returns a JSON object with a field `prompt` containing a string of the input messages formatted according to the model's chat template format. - -### POST `/embedding`: Generate embedding of a given text - -> [!IMPORTANT] -> -> This endpoint is **not** OAI-compatible. For OAI-compatible client, use `/v1/embeddings` instead. - -The same as [the embedding example](../embedding) does. - -*Options:* - -`content`: Set the text to process. - -`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. 
In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. - -### POST `/reranking`: Rerank documents according to a given query - -Similar to https://jina.ai/reranker/ but might change in the future. -Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options. - -*Options:* - -`query`: The query against which the documents will be ranked. - -`documents`: An array strings representing the documents to be ranked. - -*Aliases:* - - `/rerank` - - `/v1/rerank` - - `/v1/reranking` - -*Examples:* - -```shell -curl http://127.0.0.1:8012/v1/rerank \ - -H "Content-Type: application/json" \ - -d '{ - "model": "some-model", - "query": "What is panda?", - "top_n": 3, - "documents": [ - "hi", - "it is a bear", - "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." - ] - }' | jq -``` - -### POST `/infill`: For code infilling. - -Takes a prefix and a suffix and returns the predicted completion as stream. - -*Options:* - -- `input_prefix`: Set the prefix of the code to infill. -- `input_suffix`: Set the suffix of the code to infill. -- `input_extra`: Additional context inserted before the FIM prefix. -- `prompt`: Added after the `FIM_MID` token - -`input_extra` is array of `{"filename": string, "text": string}` objects. - -The endpoint also accepts all the options of `/completion`. - -If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used: - -```txt -myproject -{chunk 0 filename} -{chunk 0 text} -{chunk 1 filename} -{chunk 1 text} -... -filename -[input_prefix][input_suffix][prompt] -``` - -If the tokens are missing, then the extra context is simply prefixed at the start: - -```txt -[input_extra][input_prefix][input_suffix][prompt] -``` - -### **GET** `/props`: Get server global properties. - -This endpoint is public (no API key check). By default, it is read-only. 
To make POST request to change global properties, you need to start server with `--props` - -**Response format** +### Result JSON ```json { - "default_generation_settings": { - "id": 0, - "id_task": -1, - "n_ctx": 1024, - "speculative": false, - "is_processing": false, - "params": { - "n_predict": -1, - "seed": 4294967295, - "temperature": 0.800000011920929, - "dynatemp_range": 0.0, - "dynatemp_exponent": 1.0, - "top_k": 40, - "top_p": 0.949999988079071, - "min_p": 0.05000000074505806, - "xtc_probability": 0.0, - "xtc_threshold": 0.10000000149011612, - "typical_p": 1.0, - "repeat_last_n": 64, - "repeat_penalty": 1.0, - "presence_penalty": 0.0, - "frequency_penalty": 0.0, - "dry_multiplier": 0.0, - "dry_base": 1.75, - "dry_allowed_length": 2, - "dry_penalty_last_n": -1, - "dry_sequence_breakers": [ - "\n", - ":", - "\"", - "*" - ], - "mirostat": 0, - "mirostat_tau": 5.0, - "mirostat_eta": 0.10000000149011612, - "stop": [], - "max_tokens": -1, - "n_keep": 0, - "n_discard": 0, - "ignore_eos": false, - "stream": true, - "n_probs": 0, - "min_keep": 0, - "grammar": "", - "samplers": [ - "dry", - "top_k", - "typ_p", - "top_p", - "min_p", - "xtc", - "temperature" - ], - "speculative.n_max": 16, - "speculative.n_min": 5, - "speculative.p_min": 0.8999999761581421, - "timings_per_token": false - }, - "prompt": "", - "next_token": { - "has_next_token": true, - "has_new_line": false, - "n_remain": -1, - "n_decoded": 0, - "stopping_word": "" - } - }, + "assistant_name": "", + "user_name": "", + "default_generation_settings": { ... }, "total_slots": 1, - "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", - "chat_template": "...", - "build_info": "b(build number)-(build commit hash)" + "chat_template": "" } ``` +- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots. +- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) -- `model_path` - the path to model file (same with `-m` argument) - `chat_template` - the model's original Jinja2 prompt template -### POST `/props`: Change server global properties. +- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. -To use this endpoint with POST method, you need to start server with `--props` + *Options:* -*Options:* + See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported. -- None yet + The `response_format` parameter supports both plain JSON output (e.g. 
`{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers. -### POST `/embeddings`: non-OpenAI-compatible embeddings API + *Examples:* -This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm. + You can use either Python `openai` library with appropriate checkpoints: -Note that the response format of this endpoint is different from `/v1/embeddings`. + ```python + import openai -*Options:* + client = openai.OpenAI( + base_url="http://localhost:8080/v1", # "http://:port" + api_key = "sk-no-key-required" + ) -Same as the `/v1/embeddings` endpoint. - -*Examples:* - -Same as the `/v1/embeddings` endpoint. - -**Response format** - -``` -[ - { - "index": 0, - "embedding": [ - [ ... embeddings for token 0 ... ], - [ ... embeddings for token 1 ... ], - [ ... ] - [ ... embeddings for token N-1 ... ], + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."}, + {"role": "user", "content": "Write a limerick about python exceptions"} ] - }, - ... - { - "index": P, - "embedding": [ - [ ... embeddings for token 0 ... ], - [ ... embeddings for token 1 ... ], - [ ... ] - [ ... embeddings for token N-1 ... ], + ) + + print(completion.choices[0].message) + ``` + + ... or raw HTTP requests: + + ```shell + curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ + -d '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "system", + "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests." + }, + { + "role": "user", + "content": "Write a limerick about python exceptions" + } ] - } -] -``` + }' + ``` -### GET `/slots`: Returns the current slots processing state +- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API. -> [!WARNING] -> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments. + *Options:* -This endpoint is disabled by default and can be enabled with `--slots` + See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings). -If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots. + *Examples:* -**Response format** + - input as string -Example: + ```shell + curl http://localhost:8080/v1/embeddings \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ + -d '{ + "input": "hello", + "model":"GPT-4", + "encoding_format": "float" + }' + ``` + + - `input` as string array + + ```shell + curl http://localhost:8080/v1/embeddings \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ + -d '{ + "input": ["hello", "world"], + "model":"GPT-4", + "encoding_format": "float" + }' + ``` + +- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`. 
+ +### Result JSON ```json [ - { - "id": 0, - "id_task": -1, - "n_ctx": 1024, - "speculative": false, - "is_processing": false, - "params": { - "n_predict": -1, - "seed": 4294967295, - "temperature": 0.800000011920929, - "dynatemp_range": 0.0, - "dynatemp_exponent": 1.0, - "top_k": 40, - "top_p": 0.949999988079071, - "min_p": 0.05000000074505806, - "xtc_probability": 0.0, - "xtc_threshold": 0.10000000149011612, - "typical_p": 1.0, - "repeat_last_n": 64, - "repeat_penalty": 1.0, - "presence_penalty": 0.0, - "frequency_penalty": 0.0, - "dry_multiplier": 0.0, - "dry_base": 1.75, - "dry_allowed_length": 2, - "dry_penalty_last_n": -1, - "dry_sequence_breakers": [ - "\n", - ":", - "\"", - "*" - ], - "mirostat": 0, - "mirostat_tau": 5.0, - "mirostat_eta": 0.10000000149011612, - "stop": [], - "max_tokens": -1, - "n_keep": 0, - "n_discard": 0, - "ignore_eos": false, - "stream": true, - "n_probs": 0, - "min_keep": 0, - "grammar": "", - "samplers": [ - "dry", - "top_k", - "typ_p", - "top_p", - "min_p", - "xtc", - "temperature" - ], - "speculative.n_max": 16, - "speculative.n_min": 5, - "speculative.p_min": 0.8999999761581421, - "timings_per_token": false - }, - "prompt": "", - "next_token": { - "has_next_token": true, - "has_new_line": false, - "n_remain": -1, - "n_decoded": 0, - "stopping_word": "" + { + "dynatemp_exponent": 1.0, + "dynatemp_range": 0.0, + "frequency_penalty": 0.0, + "grammar": "", + "id": 0, + "ignore_eos": false, + "logit_bias": [], + "min_p": 0.05000000074505806, + "mirostat": 0, + "mirostat_eta": 0.10000000149011612, + "mirostat_tau": 5.0, + "model": "llama-2-7b-32k-instruct.Q2_K.gguf", + "n_ctx": 2048, + "n_keep": 0, + "n_predict": 100000, + "n_probs": 0, + "next_token": { + "has_next_token": true, + "n_remain": -1, + "n_decoded": 0, + "stopped_eos": false, + "stopped_limit": false, + "stopped_word": false, + "stopping_word": "" + }, + "penalize_nl": true, + "penalty_prompt_tokens": [], + "presence_penalty": 0.0, + "prompt": "Say hello to llama.cpp", + "repeat_last_n": 64, + "repeat_penalty": 1.100000023841858, + "samplers": [ + "top_k", + "tfs_z", + "typical_p", + "top_p", + "min_p", + "temperature" + ], + "seed": 42, + "state": 1, + "stop": [ + "\n" + ], + "stream": false, + "task_id": 0, + "temperature": 0.0, + "tfs_z": 1.0, + "top_k": 40, + "top_p": 0.949999988079071, + "typical_p": 1.0, + "use_penalty_prompt_tokens": false } - } ] ``` -### GET `/metrics`: Prometheus compatible metrics exporter - -This endpoint is only accessible if `--metrics` is set. +- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled: Available metrics: - `llamacpp:prompt_tokens_total`: Number of prompt tokens processed. @@ -903,13 +750,13 @@ Available metrics: - `llamacpp:requests_processing`: Number of requests processing. - `llamacpp:requests_deferred`: Number of requests deferred. -### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. +- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. -*Options:* + *Options:* -`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter. + `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter. 
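For instance, a minimal sketch of the save call documented above, assuming the server was started with `--slot-save-path` and that slot `0` exists (host, port and filename are placeholders):

```python
import requests

# Ask the server to persist slot 0's prompt cache; the file is written
# into the directory given by --slot-save-path.
resp = requests.post(
    "http://localhost:8080/slots/0?action=save",
    json={"filename": "slot0.bin"},
    timeout=60,
)
print(resp.json())  # see the response format below
```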
-**Response format** +### Result JSON ```json { @@ -923,13 +770,13 @@ Available metrics: } ``` -### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file. +- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file. -*Options:* + *Options:* -`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter. + `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter. -**Response format** +### Result JSON ```json { @@ -943,9 +790,9 @@ Available metrics: } ``` -### POST `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot. +- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot. -**Response format** +### Result JSON ```json { @@ -954,385 +801,30 @@ Available metrics: } ``` -### GET `/lora-adapters`: Get list of all LoRA adapters - -This endpoint returns the loaded LoRA adapters. You can add adapters using `--lora` when starting the server, for example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...` - -By default, all adapters will be loaded with scale set to 1. To initialize all adapters scale to 0, add `--lora-init-without-apply` - -Please note that this value will be overwritten by the `lora` field for each request. - -If an adapter is disabled, the scale will be set to 0. - -**Response format** - -```json -[ - { - "id": 0, - "path": "my_adapter_1.gguf", - "scale": 0.0 - }, - { - "id": 1, - "path": "my_adapter_2.gguf", - "scale": 0.0 - } -] -``` - -### POST `/lora-adapters`: Set list of LoRA adapters - -This sets the global scale for LoRA adapters. Please note that this value will be overwritten by the `lora` field for each request. - -To disable an adapter, either remove it from the list below, or set scale to 0. - -**Request format** - -To know the `id` of the adapter, use GET `/lora-adapters` - -```json -[ - {"id": 0, "scale": 0.2}, - {"id": 1, "scale": 0.8} -] -``` - -## OpenAI-compatible API Endpoints - -### GET `/v1/models`: OpenAI-compatible Model Info API - -Returns information about the loaded model. See [OpenAI Models API documentation](https://platform.openai.com/docs/api-reference/models). - -The returned list always has one single element. - -By default, model `id` field is the path to model file, specified via `-m`. You can set a custom value for model `id` field via `--alias` argument. For example, `--alias gpt-4o-mini`. - -Example: - -```json -{ - "object": "list", - "data": [ - { - "id": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", - "object": "model", - "created": 1735142223, - "owned_by": "llamacpp", - "meta": { - "vocab_type": 2, - "n_vocab": 128256, - "n_ctx_train": 131072, - "n_embd": 4096, - "n_params": 8030261312, - "size": 4912898304 - } - } - ] -} -``` - -### POST `/v1/completions`: OpenAI-compatible Completions API - -Given an input `prompt`, it returns the predicted completion. Streaming mode is also supported. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. - -*Options:* - -See [OpenAI Completions API documentation](https://platform.openai.com/docs/api-reference/completions). - -llama.cpp `/completion`-specific features such as `mirostat` are supported. 
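Besides the `openai` library example below, the endpoint can also be driven with plain HTTP, including streaming. A rough sketch follows; the SSE framing and the `choices[0].text` field mirror typical OpenAI-style streaming responses, so treat those details as assumptions rather than a guaranteed wire format:

```python
import json
import requests

# Placeholder host/port, matching the other examples in this document.
with requests.post(
    "http://localhost:8080/v1/completions",
    json={"prompt": "I believe the meaning of life is", "max_tokens": 8, "stream": True},
    stream=True,
    timeout=600,
) as resp:
    for line in resp.iter_lines():
        # Server-sent events arrive as lines of the form "data: {...}".
        if not line or not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload == b"[DONE]":
            break
        chunk = json.loads(payload)
        print(chunk["choices"][0].get("text", ""), end="", flush=True)
```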
- -*Examples:* - -Example usage with `openai` python library: - -```python -import openai - -client = openai.OpenAI( - base_url="http://localhost:8080/v1", # "http://:port" - api_key = "sk-no-key-required" -) - -completion = client.completions.create( - model="davinci-002", - prompt="I believe the meaning of life is", - max_tokens=8 -) - -print(completion.choices[0].text) -``` - -### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API - -Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. - -*Options:* - -See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported. - -The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers. - -*Examples:* - -You can use either Python `openai` library with appropriate checkpoints: - -```python -import openai - -client = openai.OpenAI( - base_url="http://localhost:8080/v1", # "http://:port" - api_key = "sk-no-key-required" -) - -completion = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."}, - {"role": "user", "content": "Write a limerick about python exceptions"} - ] -) - -print(completion.choices[0].message) -``` - -... or raw HTTP requests: - -```shell -curl http://localhost:8080/v1/chat/completions \ --H "Content-Type: application/json" \ --H "Authorization: Bearer no-key" \ --d '{ -"model": "gpt-3.5-turbo", -"messages": [ -{ - "role": "system", - "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests." -}, -{ - "role": "user", - "content": "Write a limerick about python exceptions" -} -] -}' -``` - -*Tool call support* - -[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639): - -- Requires `--jinja` flag -- Native tool call formats supported: - - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2 - - Functionary v3.1 / v3.2 - - Hermes 2/3, Qwen 2.5 - - Mistral Nemo - - Firefunction v2 - - Command R7B - - DeepSeek R1 (WIP / seems reluctant to call any tools?) - -
- Show some common templates and which format handler they use - - | Template | Format | - |----------|--------| - | CohereForAI-c4ai-command-r-plus-default.jinja | generic tool calls | - | CohereForAI-c4ai-command-r-plus-rag.jinja | generic tool calls | - | CohereForAI-c4ai-command-r-plus-tool_use.jinja | generic tool calls | - | MiniMaxAI-MiniMax-Text-01.jinja | generic tool calls | - | NexaAIDev-Octopus-v2.jinja | generic tool calls | - | NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | generic tool calls | - | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | hermes 2 pro tool calls | - | NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | generic tool calls | - | NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | hermes 2 pro tool calls | - | NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | generic tool calls | - | NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | hermes 2 pro tool calls | - | OrionStarAI-Orion-14B-Chat.jinja | generic tool calls | - | Qwen-QwQ-32B-Preview.jinja | hermes 2 pro tool calls | - | Qwen-Qwen2-7B-Instruct.jinja | generic tool calls | - | Qwen-Qwen2-VL-7B-Instruct.jinja | generic tool calls | - | Qwen-Qwen2.5-7B-Instruct.jinja | hermes 2 pro tool calls | - | Qwen-Qwen2.5-Math-7B-Instruct.jinja | hermes 2 pro tool calls | - | TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | generic tool calls | - | abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | generic tool calls | - | bofenghuang-vigogne-2-70b-chat.jinja | generic tool calls | - | databricks-dbrx-instruct.jinja | generic tool calls | - | deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | generic tool calls | - | deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | deepseek r1 tool calls | - | deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | deepseek r1 tool calls | - | deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | deepseek r1 tool calls | - | deepseek-ai-DeepSeek-V2.5.jinja | deepseek r1 tool calls | - | deepseek-ai-deepseek-coder-33b-instruct.jinja | generic tool calls | - | google-gemma-2-2b-it.jinja | generic tool calls | - | google-gemma-7b-it.jinja | generic tool calls | - | indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | generic tool calls | - | mattshumer-Reflection-Llama-3.1-70B.jinja | generic tool calls | - | meetkai-functionary-medium-v3.2.jinja | functionary v3.2 tool calls | - | meta-llama-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) | - | meta-llama-Llama-3.2-3B-Instruct.jinja | llama 3.x tool calls | - | meta-llama-Llama-3.3-70B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) | - | meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) | - | microsoft-Phi-3-medium-4k-instruct.jinja | generic tool calls | - | microsoft-Phi-3-mini-4k-instruct.jinja | generic tool calls | - | microsoft-Phi-3-small-8k-instruct.jinja | generic tool calls | - | microsoft-Phi-3.5-mini-instruct.jinja | generic tool calls | - | microsoft-Phi-3.5-vision-instruct.jinja | generic tool calls | - | mistralai-Mistral-7B-Instruct-v0.2.jinja | generic tool calls | - | mistralai-Mistral-Large-Instruct-2407.jinja | mistral nemo tool calls | - | mistralai-Mistral-Large-Instruct-2411.jinja | generic tool calls | - | mistralai-Mistral-Nemo-Instruct-2407.jinja | mistral nemo tool calls | - | mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | generic tool calls | - | mlabonne-AlphaMonarch-7B.jinja | generic tool calls | - | nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | llama 3.x tool calls (w/ builtin tools) | - | openchat-openchat-3.5-0106.jinja | generic 
tool calls | - | teknium-OpenHermes-2.5-Mistral-7B.jinja | generic tool calls | - - This table can be generated with: - - ```bash - ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null - ``` -
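Which of these handlers actually gets used depends on the template the server loads. One quick way to inspect it is the `chat_template` field returned by `/props`, as documented earlier in this README; a small sketch, assuming the default host and port:

```python
import requests

# /props is public (no API key check) and includes the model's original Jinja2 chat template.
props = requests.get("http://localhost:8080/props", timeout=10).json()
print(props.get("chat_template", "")[:500])
```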
- -- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs). - - Use `--chat-template-file` to override the template when appropriate (see examples below) - - Generic support may consume more tokens and be less efficient than a model's native format. - -- Run with: - - ```shell - # Native support: - llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M - llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L - llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M - llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M - - # Native support requires the right template for these GGUFs: - - llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \ - --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use ) - - llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \ - --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use ) - - llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \ - --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use ) - - llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \ - --chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use ) - - # Generic format support - llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0 - llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0 - llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K - ``` - -- Test in CLI: - - ```bash - curl http://localhost:8080/v1/chat/completions -d '{ - "model": "gpt-3.5-turbo", - "tools": [ - { - "type":"function", - "function":{ - "name":"get_current_weather", - "description":"Get the current weather in a given location", - "parameters":{ - "type":"object", - "properties":{ - "location":{ - "type":"string", - "description":"The city and state, e.g. San Francisco, CA" - } - }, - "required":["location"] - } - } - } - ], - "messages": [ - { - "role": "user", - "content": "What is the weather like in Istanbul?." - } - ] - }' - ``` - -
- Show output - - ```json - { - "choices": [ - { - "finish_reason": "tool", - "index": 0, - "message": { - "content": null, - "tool_calls": [ - { - "name": "python", - "arguments": "{\"code\":\" \\nprint(\\\"Hello, World!\\\")\"}" - } - ], - "role": "assistant" - } - } - ], - "created": 1727287211, - "model": "gpt-3.5-turbo", - "object": "chat.completion", - "usage": { - "completion_tokens": 16, - "prompt_tokens": 44, - "total_tokens": 60 - }, - "id": "chatcmpl-Htbgh9feMmGM0LEH2hmQvwsCxq3c6Ni8" - } - ``` - -
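The same weather-tool request can be issued from Python with the `openai` client used elsewhere in this document; a minimal sketch, where the tool schema mirrors the curl example above and the model name is only a placeholder (the server answers with whatever model it has loaded):

```python
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

# Tool schema copied from the curl example above.
tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
            },
            "required": ["location"],
        },
    },
}]

completion = client.chat.completions.create(
    model="gpt-3.5-turbo",  # placeholder name, ignored by the local server
    messages=[{"role": "user", "content": "What is the weather like in Istanbul?"}],
    tools=tools,
)

# Tool calls (if any) appear on the returned message, mirroring the output shown above.
print(completion.choices[0].message.tool_calls)
```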
- -### POST `/v1/embeddings`: OpenAI-compatible embeddings API - -This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm. - -*Options:* - -See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings). - -*Examples:* - -- input as string - - ```shell - curl http://localhost:8080/v1/embeddings \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer no-key" \ - -d '{ - "input": "hello", - "model":"GPT-4", - "encoding_format": "float" - }' - ``` - -- `input` as string array - - ```shell - curl http://localhost:8080/v1/embeddings \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer no-key" \ - -d '{ - "input": ["hello", "world"], - "model":"GPT-4", - "encoding_format": "float" - }' - ``` - ## More examples +### Change system prompt on runtime + +To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once. + +`prompt`: Specify a context that you want all connecting clients to respect. + +`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint. + +`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint. + +```json +{ + "system_prompt": { + "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:", + "anti_prompt": "User:", + "assistant_name": "Assistant:" + } +} +``` + +**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`. + ### Interactive mode Check the sample in [chat.mjs](chat.mjs). @@ -1396,16 +888,6 @@ Apart from error types supported by OAI, we also have custom types that are spec } ``` -### Legacy completion web UI - -A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). 
If you want to use the old completion, start the server with `--path ./examples/server/public_legacy` - -For example: - -```sh -./llama-server -m my_model.gguf -c 8192 --path ./examples/server/public_legacy -``` - ### Extending or building alternative Web Front End You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method. diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 9549795ec..0f18ca396 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/). SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension. -Example (assuming golang >= 1.21 is installed): +Example: ```shell go install go.k6.io/xk6/cmd/xk6@latest -$GOPATH/bin/xk6 build master \ +xk6 build master \ --with github.com/phymbert/xk6-sse ``` @@ -33,13 +33,14 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1 Example: ```shell -llama-server --host localhost --port 8080 \ +server --host localhost --port 8080 \ --model ggml-model-q4_0.gguf \ --cont-batching \ --metrics \ --parallel 8 \ --batch-size 512 \ --ctx-size 4096 \ + --log-format text \ -ngl 33 ``` diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 5cc6f92ab..2daac0884 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -189,12 +189,12 @@ xychart-beta "pp": { "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2), "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2), - "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0, + "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2), }, "tg": { "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2), "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2), - "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0, + "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2), }, } with open("results.github.env", 'a') as github_env: @@ -214,14 +214,11 @@ def start_benchmark(args): k6_args = [ 'run', args.scenario, '--no-color', - '--no-connection-reuse', - '--no-vu-connection-reuse', ] k6_args.extend(['--duration', args.duration]) k6_args.extend(['--iterations', args.n_prompts]) k6_args.extend(['--vus', args.parallel]) k6_args.extend(['--summary-export', 'k6-results.json']) - k6_args.extend(['--out', 'csv=k6-results.csv']) args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} " args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]]) print(f"bench: starting k6 with: {args}") @@ -234,7 +231,7 @@ def start_server(args): server_process = start_server_background(args) attempts = 0 - max_attempts = 600 + max_attempts = 20 if 'GITHUB_ACTIONS' in os.environ: max_attempts *= 2 @@ -245,15 +242,7 @@ def start_server(args): print(f"bench: waiting for server to start ...") time.sleep(0.5) - attempts = 0 - while not is_server_ready(args.host, args.port): - attempts += 1 - if attempts > max_attempts: - assert False, "server not ready" - print(f"bench: waiting for server to be ready ...") - time.sleep(0.5) - - print("bench: server 
started and ready.") + print("bench: server started.") return server_process @@ -266,6 +255,11 @@ def start_server_background(args): '--host', args.host, '--port', args.port, ] + model_file = args.model_path_prefix + os.path.sep + args.hf_file + model_dir = os.path.dirname(model_file) + if not os.path.exists(model_dir): + os.makedirs(model_dir) + server_args.extend(['--model', model_file]) server_args.extend(['--hf-repo', args.hf_repo]) server_args.extend(['--hf-file', args.hf_file]) server_args.extend(['--n-gpu-layers', args.n_gpu_layers]) @@ -278,6 +272,7 @@ def start_server_background(args): server_args.append('--cont-batching') server_args.append('--metrics') server_args.append('--flash-attn') + server_args.extend(['--log-format', "text"]) args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") pkwargs = { @@ -309,12 +304,6 @@ def is_server_listening(server_fqdn, server_port): return _is_server_listening -def is_server_ready(server_fqdn, server_port): - url = f"http://{server_fqdn}:{server_port}/health" - response = requests.get(url) - return response.status_code == 200 - - def escape_metric_name(metric_name): return re.sub('[^A-Z0-9]', '_', metric_name.upper()) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index 2772bee5e..bdf4f5abc 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -56,7 +56,6 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens') const llamacpp_tokens_second = new Trend('llamacpp_tokens_second') const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second') -const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second') const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') @@ -90,9 +89,6 @@ export default function () { ], "model": model, "stream": true, - "stream_options": { - "include_usage": true, // False to be supported in llama.cpp server - }, "seed": 42, "max_tokens": max_tokens, "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. 
not instructed) since the server expects that the model always to emit BOS @@ -109,20 +105,12 @@ export default function () { client.on('event', function (event) { if (promptEvalEndTime == null) { promptEvalEndTime = new Date() - llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3) - } - - if (event.data === '[DONE]' || event.data === '') { - return } let chunk = JSON.parse(event.data) - - if (chunk.choices && chunk.choices.length > 0) { - let choice = chunk.choices[0] - if (choice.finish_reason) { - finish_reason = choice.finish_reason - } + let choice = chunk.choices[0] + if (choice.finish_reason) { + finish_reason = choice.finish_reason } if (chunk.usage) { diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs index 4fef5655a..a79c8a3cd 100644 --- a/examples/server/chat.mjs +++ b/examples/server/chat.mjs @@ -1,7 +1,7 @@ import * as readline from 'node:readline' import { stdin, stdout } from 'node:process' import { readFileSync } from 'node:fs' -import { SchemaConverter } from './public_legacy/json-schema-to-grammar.mjs' +import { SchemaConverter } from './public/json-schema-to-grammar.mjs' const args = process.argv.slice(2); const grammarJsonSchemaFile = args.find( diff --git a/examples/server/deps.sh b/examples/server/deps.sh new file mode 100755 index 000000000..d28378901 --- /dev/null +++ b/examples/server/deps.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# Download and update deps for binary + +# get the directory of this script file +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +PUBLIC=$DIR/public + +echo "download js bundle files" +curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js +echo >> $PUBLIC/index.js # add newline diff --git a/examples/server/httplib.h b/examples/server/httplib.h index c2f12dd2a..f360bd93e 100644 --- a/examples/server/httplib.h +++ b/examples/server/httplib.h @@ -8,7 +8,7 @@ #ifndef CPPHTTPLIB_HTTPLIB_H #define CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_VERSION "0.18.5" +#define CPPHTTPLIB_VERSION "0.15.3" /* * Configuration @@ -18,12 +18,8 @@ #define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5 #endif -#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND -#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND 10000 -#endif - #ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT -#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 100 +#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 #endif #ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND @@ -34,36 +30,20 @@ #define CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND 0 #endif -#ifndef CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND -#define CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND 5 +#ifndef CPPHTTPLIB_READ_TIMEOUT_SECOND +#define CPPHTTPLIB_READ_TIMEOUT_SECOND 5 #endif -#ifndef CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND -#define CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND 0 +#ifndef CPPHTTPLIB_READ_TIMEOUT_USECOND +#define CPPHTTPLIB_READ_TIMEOUT_USECOND 0 #endif -#ifndef CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND -#define CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND 5 +#ifndef CPPHTTPLIB_WRITE_TIMEOUT_SECOND +#define CPPHTTPLIB_WRITE_TIMEOUT_SECOND 5 #endif -#ifndef CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND -#define CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND -#define CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND 300 -#endif - -#ifndef CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND -#define CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND -#define 
CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND 5 -#endif - -#ifndef CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND -#define CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND 0 +#ifndef CPPHTTPLIB_WRITE_TIMEOUT_USECOND +#define CPPHTTPLIB_WRITE_TIMEOUT_USECOND 0 #endif #ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND @@ -110,12 +90,8 @@ #define CPPHTTPLIB_TCP_NODELAY false #endif -#ifndef CPPHTTPLIB_IPV6_V6ONLY -#define CPPHTTPLIB_IPV6_V6ONLY false -#endif - #ifndef CPPHTTPLIB_RECV_BUFSIZ -#define CPPHTTPLIB_RECV_BUFSIZ size_t(16384u) +#define CPPHTTPLIB_RECV_BUFSIZ size_t(4096u) #endif #ifndef CPPHTTPLIB_COMPRESSION_BUFSIZ @@ -169,11 +145,11 @@ using ssize_t = long; #endif // _MSC_VER #ifndef S_ISREG -#define S_ISREG(m) (((m) & S_IFREG) == S_IFREG) +#define S_ISREG(m) (((m)&S_IFREG) == S_IFREG) #endif // S_ISREG #ifndef S_ISDIR -#define S_ISDIR(m) (((m) & S_IFDIR) == S_IFDIR) +#define S_ISDIR(m) (((m)&S_IFDIR) == S_IFDIR) #endif // S_ISDIR #ifndef NOMINMAX @@ -293,12 +269,7 @@ using socket_t = int; #include #include -#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER) -#if OPENSSL_VERSION_NUMBER < 0x1010107f -#error Please use OpenSSL or a current version of BoringSSL -#endif -#define SSL_get1_peer_certificate SSL_get_peer_certificate -#elif OPENSSL_VERSION_NUMBER < 0x30000000L +#if OPENSSL_VERSION_NUMBER < 0x30000000L #error Sorry, OpenSSL versions prior to 3.0.0 are not supported #endif @@ -341,63 +312,16 @@ make_unique(std::size_t n) { return std::unique_ptr(new RT[n]); } -namespace case_ignore { - -inline unsigned char to_lower(int c) { - const static unsigned char table[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, - 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, - 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, - 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, - 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, - 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, - 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 224, 225, 226, - 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, - 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, 224, - 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, - 255, - }; - return table[(unsigned char)(char)c]; -} - -inline bool equal(const std::string &a, const std::string &b) { - return a.size() == b.size() && - std::equal(a.begin(), a.end(), b.begin(), [](char ca, char cb) { - return to_lower(ca) == to_lower(cb); - }); -} - -struct equal_to { - bool operator()(const std::string &a, const std::string &b) const { - return equal(a, b); +struct ci { + bool operator()(const std::string &s1, const std::string &s2) const { + return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), + s2.end(), + [](unsigned char c1, unsigned char c2) { + return ::tolower(c1) < ::tolower(c2); + }); } }; -struct hash { - size_t operator()(const std::string &key) const { - return 
hash_core(key.data(), key.size(), 0); - } - - size_t hash_core(const char *s, size_t l, size_t h) const { - return (l == 0) ? h - : hash_core(s + 1, l - 1, - // Unsets the 6 high bits of h, therefore no - // overflow happens - (((std::numeric_limits::max)() >> 6) & - h * 33) ^ - static_cast(to_lower(*s))); - } -}; - -} // namespace case_ignore - // This is based on // "http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4189". @@ -503,9 +427,7 @@ enum StatusCode { NetworkAuthenticationRequired_511 = 511, }; -using Headers = - std::unordered_multimap; +using Headers = std::multimap; using Params = std::multimap; using Match = std::smatch; @@ -612,7 +534,6 @@ using Ranges = std::vector; struct Request { std::string method; std::string path; - Params params; Headers headers; std::string body; @@ -624,11 +545,11 @@ struct Request { // for server std::string version; std::string target; + Params params; MultipartFormDataMap files; Ranges ranges; Match matches; std::unordered_map path_params; - std::function is_connection_closed = []() { return true; }; // for client ResponseHandler response_handler; @@ -639,10 +560,8 @@ struct Request { #endif bool has_header(const std::string &key) const; - std::string get_header_value(const std::string &key, const char *def = "", - size_t id = 0) const; - uint64_t get_header_value_u64(const std::string &key, uint64_t def = 0, - size_t id = 0) const; + std::string get_header_value(const std::string &key, size_t id = 0) const; + uint64_t get_header_value_u64(const std::string &key, size_t id = 0) const; size_t get_header_value_count(const std::string &key) const; void set_header(const std::string &key, const std::string &val); @@ -673,10 +592,8 @@ struct Response { std::string location; // Redirect location bool has_header(const std::string &key) const; - std::string get_header_value(const std::string &key, const char *def = "", - size_t id = 0) const; - uint64_t get_header_value_u64(const std::string &key, uint64_t def = 0, - size_t id = 0) const; + std::string get_header_value(const std::string &key, size_t id = 0) const; + uint64_t get_header_value_u64(const std::string &key, size_t id = 0) const; size_t get_header_value_count(const std::string &key) const; void set_header(const std::string &key, const std::string &val); @@ -697,10 +614,6 @@ struct Response { const std::string &content_type, ContentProviderWithoutLength provider, ContentProviderResourceReleaser resource_releaser = nullptr); - void set_file_content(const std::string &path, - const std::string &content_type); - void set_file_content(const std::string &path); - Response() = default; Response(const Response &) = default; Response &operator=(const Response &) = default; @@ -718,8 +631,6 @@ struct Response { ContentProviderResourceReleaser content_provider_resource_releaser_; bool is_chunked_content_provider_ = false; bool content_provider_success_ = false; - std::string file_content_path_; - std::string file_content_content_type_; }; class Stream { @@ -735,6 +646,8 @@ public: virtual void get_local_ip_and_port(std::string &ip, int &port) const = 0; virtual socket_t socket() const = 0; + template + ssize_t write_format(const char *fmt, const Args &...args); ssize_t write(const char *ptr); ssize_t write(const std::string &s); }; @@ -806,18 +719,13 @@ private: if (pool_.shutdown_ && pool_.jobs_.empty()) { break; } - fn = pool_.jobs_.front(); + fn = std::move(pool_.jobs_.front()); pool_.jobs_.pop_front(); } assert(true == static_cast(fn)); fn(); } - -#if defined(CPPHTTPLIB_OPENSSL_SUPPORT) && 
!defined(OPENSSL_IS_BORINGSSL) && \ - !defined(LIBRESSL_VERSION_NUMBER) - OPENSSL_thread_stop(); -#endif } ThreadPool &pool_; @@ -879,6 +787,7 @@ public: bool match(Request &request) const override; private: + static constexpr char marker = ':'; // Treat segment separators as the end of path parameter capture // Does not need to handle query parameters as they are parsed before path // matching @@ -962,13 +871,8 @@ public: Server &set_default_file_mimetype(const std::string &mime); Server &set_file_request_handler(Handler handler); - template - Server &set_error_handler(ErrorHandlerFunc &&handler) { - return set_error_handler_core( - std::forward(handler), - std::is_convertible{}); - } - + Server &set_error_handler(HandlerWithResponse handler); + Server &set_error_handler(Handler handler); Server &set_exception_handler(ExceptionHandler handler); Server &set_pre_routing_handler(HandlerWithResponse handler); Server &set_post_routing_handler(Handler handler); @@ -978,7 +882,6 @@ public: Server &set_address_family(int family); Server &set_tcp_nodelay(bool on); - Server &set_ipv6_v6only(bool on); Server &set_socket_options(SocketOptions socket_options); Server &set_default_headers(Headers headers); @@ -1011,24 +914,21 @@ public: bool is_running() const; void wait_until_ready() const; void stop(); - void decommission(); std::function new_task_queue; protected: - bool process_request(Stream &strm, const std::string &remote_addr, - int remote_port, const std::string &local_addr, - int local_port, bool close_connection, + bool process_request(Stream &strm, bool close_connection, bool &connection_closed, const std::function &setup_request); std::atomic svr_sock_{INVALID_SOCKET}; size_t keep_alive_max_count_ = CPPHTTPLIB_KEEPALIVE_MAX_COUNT; time_t keep_alive_timeout_sec_ = CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND; - time_t read_timeout_sec_ = CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND; - time_t read_timeout_usec_ = CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND; - time_t write_timeout_sec_ = CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND; - time_t write_timeout_usec_ = CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND; + time_t read_timeout_sec_ = CPPHTTPLIB_READ_TIMEOUT_SECOND; + time_t read_timeout_usec_ = CPPHTTPLIB_READ_TIMEOUT_USECOND; + time_t write_timeout_sec_ = CPPHTTPLIB_WRITE_TIMEOUT_SECOND; + time_t write_timeout_usec_ = CPPHTTPLIB_WRITE_TIMEOUT_USECOND; time_t idle_interval_sec_ = CPPHTTPLIB_IDLE_INTERVAL_SECOND; time_t idle_interval_usec_ = CPPHTTPLIB_IDLE_INTERVAL_USECOND; size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH; @@ -1043,9 +943,6 @@ private: static std::unique_ptr make_matcher(const std::string &pattern); - Server &set_error_handler_core(HandlerWithResponse handler, std::true_type); - Server &set_error_handler_core(Handler handler, std::false_type); - socket_t create_server_socket(const std::string &host, int port, int socket_flags, SocketOptions socket_options) const; @@ -1088,7 +985,7 @@ private: virtual bool process_and_close_socket(socket_t sock); std::atomic is_running_{false}; - std::atomic is_decommisioned{false}; + std::atomic done_{false}; struct MountPointEntry { std::string mount_point; @@ -1121,7 +1018,6 @@ private: int address_family_ = AF_UNSPEC; bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY; - bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY; SocketOptions socket_options_ = default_socket_options; Headers default_headers_; @@ -1141,7 +1037,6 @@ enum class Error { SSLConnection, SSLLoadingCerts, SSLServerVerification, - SSLServerHostnameVerification, UnsupportedMultipartBoundaryChars, 
Compression, ConnectionTimeout, @@ -1179,10 +1074,9 @@ public: // Request Headers bool has_request_header(const std::string &key) const; std::string get_request_header_value(const std::string &key, - const char *def = "", size_t id = 0) const; uint64_t get_request_header_value_u64(const std::string &key, - uint64_t def = 0, size_t id = 0) const; + size_t id = 0) const; size_t get_request_header_value_count(const std::string &key) const; private: @@ -1246,18 +1140,10 @@ public: const std::string &content_type); Result Post(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); Result Post(const std::string &path, const std::string &body, const std::string &content_type); - Result Post(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); Result Post(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); Result Post(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); @@ -1273,8 +1159,6 @@ public: Result Post(const std::string &path, const Params ¶ms); Result Post(const std::string &path, const Headers &headers, const Params ¶ms); - Result Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress); Result Post(const std::string &path, const MultipartFormDataItems &items); Result Post(const std::string &path, const Headers &headers, const MultipartFormDataItems &items); @@ -1289,18 +1173,10 @@ public: const std::string &content_type); Result Put(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); Result Put(const std::string &path, const std::string &body, const std::string &content_type); - Result Put(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); Result Put(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); Result Put(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); Result Put(const std::string &path, @@ -1315,8 +1191,6 @@ public: Result Put(const std::string &path, const Params ¶ms); Result Put(const std::string &path, const Headers &headers, const Params ¶ms); - Result Put(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress); Result Put(const std::string &path, const MultipartFormDataItems &items); Result Put(const std::string &path, const Headers &headers, const MultipartFormDataItems &items); @@ -1329,23 +1203,13 @@ public: Result Patch(const std::string &path); Result Patch(const std::string &path, const char *body, size_t content_length, const std::string &content_type); - Result Patch(const std::string &path, const char *body, size_t 
content_length, - const std::string &content_type, Progress progress); Result Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress); Result Patch(const std::string &path, const std::string &body, const std::string &content_type); - Result Patch(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); Result Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); Result Patch(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); @@ -1363,24 +1227,13 @@ public: Result Delete(const std::string &path, const Headers &headers); Result Delete(const std::string &path, const char *body, size_t content_length, const std::string &content_type); - Result Delete(const std::string &path, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); Result Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); - Result Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress); Result Delete(const std::string &path, const std::string &body, const std::string &content_type); - Result Delete(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); Result Delete(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); - Result Delete(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); Result Options(const std::string &path); Result Options(const std::string &path, const Headers &headers); @@ -1405,7 +1258,6 @@ public: void set_address_family(int family); void set_tcp_nodelay(bool on); - void set_ipv6_v6only(bool on); void set_socket_options(SocketOptions socket_options); void set_connection_timeout(time_t sec, time_t usec = 0); @@ -1457,8 +1309,6 @@ public: #ifdef CPPHTTPLIB_OPENSSL_SUPPORT void enable_server_certificate_verification(bool enabled); - void enable_server_hostname_verification(bool enabled); - void set_server_certificate_verifier(std::function verifier); #endif void set_logger(Logger logger); @@ -1525,10 +1375,10 @@ protected: time_t connection_timeout_sec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND; time_t connection_timeout_usec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND; - time_t read_timeout_sec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND; - time_t read_timeout_usec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND; - time_t write_timeout_sec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND; - time_t write_timeout_usec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND; + time_t read_timeout_sec_ = CPPHTTPLIB_READ_TIMEOUT_SECOND; + time_t read_timeout_usec_ = CPPHTTPLIB_READ_TIMEOUT_USECOND; + time_t write_timeout_sec_ = CPPHTTPLIB_WRITE_TIMEOUT_SECOND; + time_t write_timeout_usec_ = CPPHTTPLIB_WRITE_TIMEOUT_USECOND; std::string basic_auth_username_; std::string basic_auth_password_; @@ -1545,7 +1395,6 
@@ protected: int address_family_ = AF_UNSPEC; bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY; - bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY; SocketOptions socket_options_ = nullptr; bool compress_ = false; @@ -1573,8 +1422,6 @@ protected: #ifdef CPPHTTPLIB_OPENSSL_SUPPORT bool server_certificate_verification_ = true; - bool server_hostname_verification_ = true; - std::function server_certificate_verifier_; #endif Logger logger_; @@ -1583,9 +1430,6 @@ private: bool send_(Request &req, Response &res, Error &error); Result send_(Request &&req); -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - bool is_ssl_peer_could_be_closed(SSL *ssl) const; -#endif socket_t create_client_socket(Error &error) const; bool read_response_line(Stream &strm, const Request &req, Response &res) const; @@ -1604,7 +1448,7 @@ private: const Headers &headers, const char *body, size_t content_length, ContentProvider content_provider, ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type, Progress progress); + const std::string &content_type); ContentProviderWithoutLength get_multipart_content_provider( const std::string &boundary, const MultipartFormDataItems &items, const MultipartFormDataProviderItems &provider_items) const; @@ -1633,7 +1477,6 @@ public: const std::string &client_key_path); Client(Client &&) = default; - Client &operator=(Client &&) = default; ~Client(); @@ -1680,18 +1523,10 @@ public: const std::string &content_type); Result Post(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); Result Post(const std::string &path, const std::string &body, const std::string &content_type); - Result Post(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); Result Post(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); Result Post(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); @@ -1707,8 +1542,6 @@ public: Result Post(const std::string &path, const Params ¶ms); Result Post(const std::string &path, const Headers &headers, const Params ¶ms); - Result Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress); Result Post(const std::string &path, const MultipartFormDataItems &items); Result Post(const std::string &path, const Headers &headers, const MultipartFormDataItems &items); @@ -1723,18 +1556,10 @@ public: const std::string &content_type); Result Put(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); Result Put(const std::string &path, const std::string &body, const std::string &content_type); - Result Put(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); Result Put(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); - Result Put(const std::string &path, const Headers 
&headers, - const std::string &body, const std::string &content_type, - Progress progress); Result Put(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); Result Put(const std::string &path, @@ -1749,8 +1574,6 @@ public: Result Put(const std::string &path, const Params ¶ms); Result Put(const std::string &path, const Headers &headers, const Params ¶ms); - Result Put(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress); Result Put(const std::string &path, const MultipartFormDataItems &items); Result Put(const std::string &path, const Headers &headers, const MultipartFormDataItems &items); @@ -1763,23 +1586,13 @@ public: Result Patch(const std::string &path); Result Patch(const std::string &path, const char *body, size_t content_length, const std::string &content_type); - Result Patch(const std::string &path, const char *body, size_t content_length, - const std::string &content_type, Progress progress); Result Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress); Result Patch(const std::string &path, const std::string &body, const std::string &content_type); - Result Patch(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); Result Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); Result Patch(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); @@ -1797,24 +1610,13 @@ public: Result Delete(const std::string &path, const Headers &headers); Result Delete(const std::string &path, const char *body, size_t content_length, const std::string &content_type); - Result Delete(const std::string &path, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); Result Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); - Result Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress); Result Delete(const std::string &path, const std::string &body, const std::string &content_type); - Result Delete(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); Result Delete(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); - Result Delete(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); Result Options(const std::string &path); Result Options(const std::string &path, const Headers &headers); @@ -1883,8 +1685,6 @@ public: #ifdef CPPHTTPLIB_OPENSSL_SUPPORT void enable_server_certificate_verification(bool enabled); - void enable_server_hostname_verification(bool enabled); - void set_server_certificate_verifier(std::function verifier); #endif void set_logger(Logger logger); @@ -1930,9 +1730,6 @@ public: SSL_CTX *ssl_context() const; - void update_certs(X509 *cert, EVP_PKEY 
*private_key, - X509_STORE *client_ca_cert_store = nullptr); - private: bool process_and_close_socket(socket_t sock) override; @@ -2013,58 +1810,68 @@ inline void duration_to_sec_and_usec(const T &duration, U callback) { callback(static_cast(sec), static_cast(usec)); } -inline bool is_numeric(const std::string &str) { - return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit); -} - inline uint64_t get_header_value_u64(const Headers &headers, - const std::string &key, uint64_t def, - size_t id, bool &is_invalid_value) { - is_invalid_value = false; + const std::string &key, size_t id, + uint64_t def) { auto rng = headers.equal_range(key); auto it = rng.first; std::advance(it, static_cast(id)); if (it != rng.second) { - if (is_numeric(it->second)) { - return std::strtoull(it->second.data(), nullptr, 10); - } else { - is_invalid_value = true; - } + return std::strtoull(it->second.data(), nullptr, 10); } return def; } -inline uint64_t get_header_value_u64(const Headers &headers, - const std::string &key, uint64_t def, - size_t id) { - bool dummy = false; - return get_header_value_u64(headers, key, def, id, dummy); -} - } // namespace detail inline uint64_t Request::get_header_value_u64(const std::string &key, - uint64_t def, size_t id) const { - return detail::get_header_value_u64(headers, key, def, id); + size_t id) const { + return detail::get_header_value_u64(headers, key, id, 0); } inline uint64_t Response::get_header_value_u64(const std::string &key, - uint64_t def, size_t id) const { - return detail::get_header_value_u64(headers, key, def, id); + size_t id) const { + return detail::get_header_value_u64(headers, key, id, 0); +} + +template +inline ssize_t Stream::write_format(const char *fmt, const Args &...args) { + const auto bufsiz = 2048; + std::array buf{}; + + auto sn = snprintf(buf.data(), buf.size() - 1, fmt, args...); + if (sn <= 0) { return sn; } + + auto n = static_cast(sn); + + if (n >= buf.size() - 1) { + std::vector glowable_buf(buf.size()); + + while (n >= glowable_buf.size() - 1) { + glowable_buf.resize(glowable_buf.size() * 2); + n = static_cast( + snprintf(&glowable_buf[0], glowable_buf.size() - 1, fmt, args...)); + } + return write(&glowable_buf[0], n); + } else { + return write(buf.data(), n); + } } inline void default_socket_options(socket_t sock) { - int opt = 1; + int yes = 1; #ifdef _WIN32 setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - reinterpret_cast(&opt), sizeof(opt)); + reinterpret_cast(&yes), sizeof(yes)); + setsockopt(sock, SOL_SOCKET, SO_EXCLUSIVEADDRUSE, + reinterpret_cast(&yes), sizeof(yes)); #else #ifdef SO_REUSEPORT setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, - reinterpret_cast(&opt), sizeof(opt)); + reinterpret_cast(&yes), sizeof(yes)); #else setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - reinterpret_cast(&opt), sizeof(opt)); + reinterpret_cast(&yes), sizeof(yes)); #endif #endif } @@ -2190,8 +1997,6 @@ inline std::string to_string(const Error error) { case Error::SSLConnection: return "SSL connection failed"; case Error::SSLLoadingCerts: return "SSL certificate loading failed"; case Error::SSLServerVerification: return "SSL server verification failed"; - case Error::SSLServerHostnameVerification: - return "SSL server hostname verification failed"; case Error::UnsupportedMultipartBoundaryChars: return "Unsupported HTTP multipart boundary characters"; case Error::Compression: return "Compression failed"; @@ -2211,9 +2016,8 @@ inline std::ostream &operator<<(std::ostream &os, const Error &obj) { } inline uint64_t 
Result::get_request_header_value_u64(const std::string &key, - uint64_t def, size_t id) const { - return detail::get_header_value_u64(request_headers_, key, def, id); + return detail::get_header_value_u64(request_headers_, key, id, 0); } template @@ -2276,36 +2080,6 @@ make_basic_authentication_header(const std::string &username, namespace detail { -#if defined(_WIN32) -inline std::wstring u8string_to_wstring(const char *s) { - std::wstring ws; - auto len = static_cast(strlen(s)); - auto wlen = ::MultiByteToWideChar(CP_UTF8, 0, s, len, nullptr, 0); - if (wlen > 0) { - ws.resize(wlen); - wlen = ::MultiByteToWideChar( - CP_UTF8, 0, s, len, - const_cast(reinterpret_cast(ws.data())), wlen); - if (wlen != static_cast(ws.size())) { ws.clear(); } - } - return ws; -} -#endif - -struct FileStat { - FileStat(const std::string &path); - bool is_file() const; - bool is_dir() const; - -private: -#if defined(_WIN32) - struct _stat st_; -#else - struct stat st_; -#endif - int ret_ = -1; -}; - std::string encode_query_param(const std::string &value); std::string decode_url(const std::string &s, bool convert_plus_to_space); @@ -2314,16 +2088,6 @@ void read_file(const std::string &path, std::string &out); std::string trim_copy(const std::string &s); -void divide( - const char *data, std::size_t size, char d, - std::function - fn); - -void divide( - const std::string &str, char d, - std::function - fn); - void split(const char *b, const char *e, char d, std::function fn); @@ -2335,23 +2099,18 @@ bool process_client_socket(socket_t sock, time_t read_timeout_sec, time_t write_timeout_usec, std::function callback); -socket_t create_client_socket(const std::string &host, const std::string &ip, - int port, int address_family, bool tcp_nodelay, - bool ipv6_v6only, SocketOptions socket_options, - time_t connection_timeout_sec, - time_t connection_timeout_usec, - time_t read_timeout_sec, time_t read_timeout_usec, - time_t write_timeout_sec, - time_t write_timeout_usec, - const std::string &intf, Error &error); +socket_t create_client_socket( + const std::string &host, const std::string &ip, int port, + int address_family, bool tcp_nodelay, SocketOptions socket_options, + time_t connection_timeout_sec, time_t connection_timeout_usec, + time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, + time_t write_timeout_usec, const std::string &intf, Error &error); const char *get_header_value(const Headers &headers, const std::string &key, - const char *def, size_t id); + size_t id = 0, const char *def = nullptr); std::string params_to_query_str(const Params ¶ms); -void parse_query_text(const char *data, std::size_t size, Params ¶ms); - void parse_query_text(const std::string &s, Params ¶ms); bool parse_multipart_boundary(const std::string &content_type, @@ -2511,70 +2270,15 @@ public: private: #if defined(_WIN32) - HANDLE hFile_ = NULL; - HANDLE hMapping_ = NULL; + HANDLE hFile_; + HANDLE hMapping_; #else - int fd_ = -1; + int fd_; #endif - size_t size_ = 0; - void *addr_ = nullptr; - bool is_open_empty_file = false; + size_t size_; + void *addr_; }; -// NOTE: https://www.rfc-editor.org/rfc/rfc9110#section-5 -namespace fields { - -inline bool is_token_char(char c) { - return std::isalnum(c) || c == '!' || c == '#' || c == '$' || c == '%' || - c == '&' || c == '\'' || c == '*' || c == '+' || c == '-' || - c == '.' 
|| c == '^' || c == '_' || c == '`' || c == '|' || c == '~'; -} - -inline bool is_token(const std::string &s) { - if (s.empty()) { return false; } - for (auto c : s) { - if (!is_token_char(c)) { return false; } - } - return true; -} - -inline bool is_field_name(const std::string &s) { return is_token(s); } - -inline bool is_vchar(char c) { return c >= 33 && c <= 126; } - -inline bool is_obs_text(char c) { return 128 <= static_cast(c); } - -inline bool is_field_vchar(char c) { return is_vchar(c) || is_obs_text(c); } - -inline bool is_field_content(const std::string &s) { - if (s.empty()) { return false; } - - if (s.size() == 1) { - return is_field_vchar(s[0]); - } else if (s.size() == 2) { - return is_field_vchar(s[0]) && is_field_vchar(s[1]); - } else { - size_t i = 0; - - if (!is_field_vchar(s[i])) { return false; } - i++; - - while (i < s.size() - 1) { - auto c = s[i++]; - if (c == ' ' || c == '\t' || is_field_vchar(c)) { - } else { - return false; - } - } - - return is_field_vchar(s[i]); - } -} - -inline bool is_field_value(const std::string &s) { return is_field_content(s); } - -} // namespace fields - } // namespace detail // ---------------------------------------------------------------------------- @@ -2688,6 +2392,20 @@ inline std::string base64_encode(const std::string &in) { return out; } +inline bool is_file(const std::string &path) { +#ifdef _WIN32 + return _access_s(path.c_str(), 0) == 0; +#else + struct stat st; + return stat(path.c_str(), &st) >= 0 && S_ISREG(st.st_mode); +#endif +} + +inline bool is_dir(const std::string &path) { + struct stat st; + return stat(path.c_str(), &st) >= 0 && S_ISDIR(st.st_mode); +} + inline bool is_valid_path(const std::string &path) { size_t level = 0; size_t i = 0; @@ -2730,21 +2448,6 @@ inline bool is_valid_path(const std::string &path) { return true; } -inline FileStat::FileStat(const std::string &path) { -#if defined(_WIN32) - auto wpath = u8string_to_wstring(path.c_str()); - ret_ = _wstat(wpath.c_str(), &st_); -#else - ret_ = stat(path.c_str(), &st_); -#endif -} -inline bool FileStat::is_file() const { - return ret_ >= 0 && S_ISREG(st_.st_mode); -} -inline bool FileStat::is_dir() const { - return ret_ >= 0 && S_ISDIR(st_.st_mode); -} - inline std::string encode_query_param(const std::string &value) { std::ostringstream escaped; escaped.fill('0'); @@ -2876,27 +2579,6 @@ inline std::string trim_double_quotes_copy(const std::string &s) { return s; } -inline void -divide(const char *data, std::size_t size, char d, - std::function - fn) { - const auto it = std::find(data, data + size, d); - const auto found = static_cast(it != data + size); - const auto lhs_data = data; - const auto lhs_size = static_cast(it - data); - const auto rhs_data = it + found; - const auto rhs_size = size - lhs_size - found; - - fn(lhs_data, lhs_size, rhs_data, rhs_size); -} - -inline void -divide(const std::string &str, char d, - std::function - fn) { - divide(str.data(), str.size(), d, std::move(fn)); -} - inline void split(const char *b, const char *e, char d, std::function fn) { return split(b, e, d, (std::numeric_limits::max)(), std::move(fn)); @@ -2954,10 +2636,6 @@ inline bool stream_line_reader::getline() { fixed_buffer_used_size_ = 0; glowable_buffer_.clear(); -#ifndef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR - char prev_byte = 0; -#endif - for (size_t i = 0;; i++) { char byte; auto n = strm_.read(&byte, 1); @@ -2974,12 +2652,7 @@ inline bool stream_line_reader::getline() { append(byte); -#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR if (byte == '\n') { break; 
} -#else - if (prev_byte == '\r' && byte == '\n') { break; } - prev_byte = byte; -#endif } return true; @@ -2998,7 +2671,16 @@ inline void stream_line_reader::append(char c) { } } -inline mmap::mmap(const char *path) { open(path); } +inline mmap::mmap(const char *path) +#if defined(_WIN32) + : hFile_(NULL), hMapping_(NULL) +#else + : fd_(-1) +#endif + , + size_(0), addr_(nullptr) { + open(path); +} inline mmap::~mmap() { close(); } @@ -3006,60 +2688,29 @@ inline bool mmap::open(const char *path) { close(); #if defined(_WIN32) - auto wpath = u8string_to_wstring(path); - if (wpath.empty()) { return false; } + std::wstring wpath; + for (size_t i = 0; i < strlen(path); i++) { + wpath += path[i]; + } -#if _WIN32_WINNT >= _WIN32_WINNT_WIN8 hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, NULL); -#else - hFile_ = ::CreateFileW(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, - OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); -#endif if (hFile_ == INVALID_HANDLE_VALUE) { return false; } LARGE_INTEGER size{}; if (!::GetFileSizeEx(hFile_, &size)) { return false; } - // If the following line doesn't compile due to QuadPart, update Windows SDK. - // See: - // https://github.com/yhirose/cpp-httplib/issues/1903#issuecomment-2316520721 - if (static_cast(size.QuadPart) > - (std::numeric_limits::max)()) { - // `size_t` might be 32-bits, on 32-bits Windows. - return false; - } size_ = static_cast(size.QuadPart); -#if _WIN32_WINNT >= _WIN32_WINNT_WIN8 hMapping_ = ::CreateFileMappingFromApp(hFile_, NULL, PAGE_READONLY, size_, NULL); -#else - hMapping_ = ::CreateFileMappingW(hFile_, NULL, PAGE_READONLY, 0, 0, NULL); -#endif - - // Special treatment for an empty file... - if (hMapping_ == NULL && size_ == 0) { - close(); - is_open_empty_file = true; - return true; - } if (hMapping_ == NULL) { close(); return false; } -#if _WIN32_WINNT >= _WIN32_WINNT_WIN8 addr_ = ::MapViewOfFileFromApp(hMapping_, FILE_MAP_READ, 0, 0); -#else - addr_ = ::MapViewOfFile(hMapping_, FILE_MAP_READ, 0, 0, 0); -#endif - - if (addr_ == nullptr) { - close(); - return false; - } #else fd_ = ::open(path, O_RDONLY); if (fd_ == -1) { return false; } @@ -3072,26 +2723,22 @@ inline bool mmap::open(const char *path) { size_ = static_cast(sb.st_size); addr_ = ::mmap(NULL, size_, PROT_READ, MAP_PRIVATE, fd_, 0); +#endif - // Special treatment for an empty file... - if (addr_ == MAP_FAILED && size_ == 0) { + if (addr_ == nullptr) { close(); - is_open_empty_file = true; return false; } -#endif return true; } -inline bool mmap::is_open() const { - return is_open_empty_file ? true : addr_ != nullptr; -} +inline bool mmap::is_open() const { return addr_ != nullptr; } inline size_t mmap::size() const { return size_; } inline const char *mmap::data() const { - return is_open_empty_file ? 
"" : static_cast(addr_); + return static_cast(addr_); } inline void mmap::close() { @@ -3110,8 +2757,6 @@ inline void mmap::close() { ::CloseHandle(hFile_); hFile_ = INVALID_HANDLE_VALUE; } - - is_open_empty_file = false; #else if (addr_ != nullptr) { munmap(addr_, size_); @@ -3137,10 +2782,7 @@ template inline ssize_t handle_EINTR(T fn) { ssize_t res = 0; while (true) { res = fn(); - if (res < 0 && errno == EINTR) { - std::this_thread::sleep_for(std::chrono::microseconds{1}); - continue; - } + if (res < 0 && errno == EINTR) { continue; } break; } return res; @@ -3349,37 +2991,23 @@ private: }; #endif -inline bool keep_alive(const std::atomic &svr_sock, socket_t sock, - time_t keep_alive_timeout_sec) { +inline bool keep_alive(socket_t sock, time_t keep_alive_timeout_sec) { using namespace std::chrono; - - const auto interval_usec = - CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND; - - // Avoid expensive `steady_clock::now()` call for the first time - if (select_read(sock, 0, interval_usec) > 0) { return true; } - - const auto start = steady_clock::now() - microseconds{interval_usec}; - const auto timeout = seconds{keep_alive_timeout_sec}; - + auto start = steady_clock::now(); while (true) { - if (svr_sock == INVALID_SOCKET) { - break; // Server socket is closed - } - - auto val = select_read(sock, 0, interval_usec); + auto val = select_read(sock, 0, 10000); if (val < 0) { - break; // Ssocket error + return false; } else if (val == 0) { - if (steady_clock::now() - start > timeout) { - break; // Timeout - } + auto current = steady_clock::now(); + auto duration = duration_cast(current - start); + auto timeout = keep_alive_timeout_sec * 1000; + if (duration.count() > timeout) { return false; } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); } else { - return true; // Ready for read + return true; } } - - return false; } template @@ -3390,7 +3018,8 @@ process_server_socket_core(const std::atomic &svr_sock, socket_t sock, assert(keep_alive_max_count > 0); auto ret = false; auto count = keep_alive_max_count; - while (count > 0 && keep_alive(svr_sock, sock, keep_alive_timeout_sec)) { + while (svr_sock != INVALID_SOCKET && count > 0 && + keep_alive(sock, keep_alive_timeout_sec)) { auto close_connection = count == 1; auto connection_closed = false; ret = callback(close_connection, connection_closed); @@ -3434,29 +3063,10 @@ inline int shutdown_socket(socket_t sock) { #endif } -inline std::string escape_abstract_namespace_unix_domain(const std::string &s) { - if (s.size() > 1 && s[0] == '\0') { - auto ret = s; - ret[0] = '@'; - return ret; - } - return s; -} - -inline std::string -unescape_abstract_namespace_unix_domain(const std::string &s) { - if (s.size() > 1 && s[0] == '@') { - auto ret = s; - ret[0] = '\0'; - return ret; - } - return s; -} - template socket_t create_socket(const std::string &host, const std::string &ip, int port, int address_family, int socket_flags, bool tcp_nodelay, - bool ipv6_v6only, SocketOptions socket_options, + SocketOptions socket_options, BindOrConnect bind_or_connect) { // Get address info const char *node = nullptr; @@ -3465,7 +3075,7 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, memset(&hints, 0, sizeof(struct addrinfo)); hints.ai_socktype = SOCK_STREAM; - hints.ai_protocol = IPPROTO_IP; + hints.ai_protocol = 0; if (!ip.empty()) { node = ip.c_str(); @@ -3483,32 +3093,20 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, const auto addrlen = host.length(); if (addrlen > 
sizeof(sockaddr_un::sun_path)) { return INVALID_SOCKET; } -#ifdef SOCK_CLOEXEC - auto sock = socket(hints.ai_family, hints.ai_socktype | SOCK_CLOEXEC, - hints.ai_protocol); -#else auto sock = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol); -#endif - if (sock != INVALID_SOCKET) { sockaddr_un addr{}; addr.sun_family = AF_UNIX; - - auto unescaped_host = unescape_abstract_namespace_unix_domain(host); - std::copy(unescaped_host.begin(), unescaped_host.end(), addr.sun_path); + std::copy(host.begin(), host.end(), addr.sun_path); hints.ai_addr = reinterpret_cast(&addr); hints.ai_addrlen = static_cast( sizeof(addr) - sizeof(addr.sun_path) + addrlen); -#ifndef SOCK_CLOEXEC fcntl(sock, F_SETFD, FD_CLOEXEC); -#endif - if (socket_options) { socket_options(sock); } - bool dummy; - if (!bind_or_connect(sock, hints, dummy)) { + if (!bind_or_connect(sock, hints)) { close_socket(sock); sock = INVALID_SOCKET; } @@ -3525,7 +3123,6 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, #endif return INVALID_SOCKET; } - auto se = detail::scope_exit([&] { freeaddrinfo(result); }); for (auto rp = result; rp; rp = rp->ai_next) { // Create a socket @@ -3551,18 +3148,11 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); } #else - -#ifdef SOCK_CLOEXEC - auto sock = - socket(rp->ai_family, rp->ai_socktype | SOCK_CLOEXEC, rp->ai_protocol); -#else auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); -#endif - #endif if (sock == INVALID_SOCKET) { continue; } -#if !defined _WIN32 && !defined SOCK_CLOEXEC +#ifndef _WIN32 if (fcntl(sock, F_SETFD, FD_CLOEXEC) == -1) { close_socket(sock); continue; @@ -3570,38 +3160,39 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, #endif if (tcp_nodelay) { - auto opt = 1; + auto yes = 1; #ifdef _WIN32 setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - reinterpret_cast(&opt), sizeof(opt)); + reinterpret_cast(&yes), sizeof(yes)); #else setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - reinterpret_cast(&opt), sizeof(opt)); -#endif - } - - if (rp->ai_family == AF_INET6) { - auto opt = ipv6_v6only ? 
1 : 0; -#ifdef _WIN32 - setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - reinterpret_cast(&opt), sizeof(opt)); -#else - setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - reinterpret_cast(&opt), sizeof(opt)); + reinterpret_cast(&yes), sizeof(yes)); #endif } if (socket_options) { socket_options(sock); } + if (rp->ai_family == AF_INET6) { + auto no = 0; +#ifdef _WIN32 + setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, + reinterpret_cast(&no), sizeof(no)); +#else + setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, + reinterpret_cast(&no), sizeof(no)); +#endif + } + // bind or connect - auto quit = false; - if (bind_or_connect(sock, *rp, quit)) { return sock; } + if (bind_or_connect(sock, *rp)) { + freeaddrinfo(result); + return sock; + } close_socket(sock); - - if (quit) { break; } } + freeaddrinfo(result); return INVALID_SOCKET; } @@ -3634,7 +3225,6 @@ inline bool bind_ip_address(socket_t sock, const std::string &host) { hints.ai_protocol = 0; if (getaddrinfo(host.c_str(), "0", &hints, &result)) { return false; } - auto se = detail::scope_exit([&] { freeaddrinfo(result); }); auto ret = false; for (auto rp = result; rp; rp = rp->ai_next) { @@ -3645,6 +3235,7 @@ inline bool bind_ip_address(socket_t sock, const std::string &host) { } } + freeaddrinfo(result); return ret; } @@ -3656,8 +3247,6 @@ inline bool bind_ip_address(socket_t sock, const std::string &host) { inline std::string if2ip(int address_family, const std::string &ifn) { struct ifaddrs *ifap; getifaddrs(&ifap); - auto se = detail::scope_exit([&] { freeifaddrs(ifap); }); - std::string addr_candidate; for (auto ifa = ifap; ifa; ifa = ifa->ifa_next) { if (ifa->ifa_addr && ifn == ifa->ifa_name && @@ -3667,6 +3256,7 @@ inline std::string if2ip(int address_family, const std::string &ifn) { auto sa = reinterpret_cast(ifa->ifa_addr); char buf[INET_ADDRSTRLEN]; if (inet_ntop(AF_INET, &sa->sin_addr, buf, INET_ADDRSTRLEN)) { + freeifaddrs(ifap); return std::string(buf, INET_ADDRSTRLEN); } } else if (ifa->ifa_addr->sa_family == AF_INET6) { @@ -3679,6 +3269,7 @@ inline std::string if2ip(int address_family, const std::string &ifn) { if (s6_addr_head == 0xfc || s6_addr_head == 0xfd) { addr_candidate = std::string(buf, INET6_ADDRSTRLEN); } else { + freeifaddrs(ifap); return std::string(buf, INET6_ADDRSTRLEN); } } @@ -3686,21 +3277,20 @@ inline std::string if2ip(int address_family, const std::string &ifn) { } } } + freeifaddrs(ifap); return addr_candidate; } #endif inline socket_t create_client_socket( const std::string &host, const std::string &ip, int port, - int address_family, bool tcp_nodelay, bool ipv6_v6only, - SocketOptions socket_options, time_t connection_timeout_sec, - time_t connection_timeout_usec, time_t read_timeout_sec, - time_t read_timeout_usec, time_t write_timeout_sec, + int address_family, bool tcp_nodelay, SocketOptions socket_options, + time_t connection_timeout_sec, time_t connection_timeout_usec, + time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, const std::string &intf, Error &error) { auto sock = create_socket( - host, ip, port, address_family, 0, tcp_nodelay, ipv6_v6only, - std::move(socket_options), - [&](socket_t sock2, struct addrinfo &ai, bool &quit) -> bool { + host, ip, port, address_family, 0, tcp_nodelay, std::move(socket_options), + [&](socket_t sock2, struct addrinfo &ai) -> bool { if (!intf.empty()) { #ifdef USE_IF2IP auto ip_from_if = if2ip(address_family, intf); @@ -3724,10 +3314,7 @@ inline socket_t create_client_socket( } error = wait_until_socket_is_ready(sock2, 
connection_timeout_sec, connection_timeout_usec); - if (error != Error::Success) { - if (error == Error::ConnectionTimeout) { quit = true; } - return false; - } + if (error != Error::Success) { return false; } } set_nonblocking(sock2, false); @@ -3852,7 +3439,7 @@ inline unsigned int str2tag(const std::string &s) { namespace udl { -inline constexpr unsigned int operator""_t(const char *s, size_t l) { +inline constexpr unsigned int operator"" _t(const char *s, size_t l) { return str2tag_core(s, l, 0); } @@ -3937,9 +3524,8 @@ inline bool can_compress_content_type(const std::string &content_type) { case "application/protobuf"_t: case "application/xhtml+xml"_t: return true; - case "text/event-stream"_t: return false; - - default: return !content_type.rfind("text/", 0); + default: + return !content_type.rfind("text/", 0) && tag != "text/event-stream"_t; } } @@ -4176,8 +3762,8 @@ inline bool has_header(const Headers &headers, const std::string &key) { } inline const char *get_header_value(const Headers &headers, - const std::string &key, const char *def, - size_t id) { + const std::string &key, size_t id, + const char *def) { auto rng = headers.equal_range(key); auto it = rng.first; std::advance(it, static_cast(id)); @@ -4185,6 +3771,14 @@ inline const char *get_header_value(const Headers &headers, return def; } +inline bool compare_case_ignore(const std::string &a, const std::string &b) { + if (a.size() != b.size()) { return false; } + for (size_t i = 0; i < b.size(); i++) { + if (::tolower(a[i]) != ::tolower(b[i])) { return false; } + } + return true; +} + template inline bool parse_header(const char *beg, const char *end, T fn) { // Skip trailing spaces and tabs. @@ -4207,27 +3801,15 @@ inline bool parse_header(const char *beg, const char *end, T fn) { p++; } - if (p <= end) { + if (p < end) { auto key_len = key_end - beg; if (!key_len) { return false; } auto key = std::string(beg, key_end); - auto val = case_ignore::equal(key, "Location") + auto val = compare_case_ignore(key, "Location") ? std::string(p, end) : decode_url(std::string(p, end), false); - - // NOTE: From RFC 9110: - // Field values containing CR, LF, or NUL characters are - // invalid and dangerous, due to the varying ways that - // implementations might parse and interpret those - // characters; a recipient of CR, LF, or NUL within a field - // value MUST either reject the message or replace each of - // those characters with SP before further processing or - // forwarding of that message. - static const std::string CR_LF_NUL("\r\n\0", 3); - if (val.find_first_of(CR_LF_NUL) != std::string::npos) { return false; } - - fn(key, val); + fn(std::move(key), std::move(val)); return true; } @@ -4247,27 +3829,27 @@ inline bool read_headers(Stream &strm, Headers &headers) { if (line_reader.end_with_crlf()) { // Blank line indicates end of headers. if (line_reader.size() == 2) { break; } - } else { #ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR + } else { // Blank line indicates end of headers. if (line_reader.size() == 1) { break; } line_terminator_len = 1; -#else - continue; // Skip invalid line. -#endif } +#else + } else { + continue; // Skip invalid line. 
+ } +#endif if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } // Exclude line terminator auto end = line_reader.ptr() + line_reader.size() - line_terminator_len; - if (!parse_header(line_reader.ptr(), end, - [&](const std::string &key, std::string &val) { - headers.emplace(key, val); - })) { - return false; - } + parse_header(line_reader.ptr(), end, + [&](std::string &&key, std::string &&val) { + headers.emplace(std::move(key), std::move(val)); + }); } return true; @@ -4355,19 +3937,8 @@ inline bool read_content_chunked(Stream &strm, T &x, assert(chunk_len == 0); - // NOTE: In RFC 9112, '7.1 Chunked Transfer Coding' mentiones "The chunked - // transfer coding is complete when a chunk with a chunk-size of zero is - // received, possibly followed by a trailer section, and finally terminated by - // an empty line". https://www.rfc-editor.org/rfc/rfc9112.html#section-7.1 - // - // In '7.1.3. Decoding Chunked', however, the pseudo-code in the section - // does't care for the existence of the final CRLF. In other words, it seems - // to be ok whether the final CRLF exists or not in the chunked data. - // https://www.rfc-editor.org/rfc/rfc9112.html#section-7.1.3 - // - // According to the reference code in RFC 9112, cpp-htpplib now allows - // chuncked transfer coding data without the final CRLF. - if (!line_reader.getline()) { return true; } + // Trailer + if (!line_reader.getline()) { return false; } while (strcmp(line_reader.ptr(), "\r\n") != 0) { if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } @@ -4377,8 +3948,8 @@ inline bool read_content_chunked(Stream &strm, T &x, auto end = line_reader.ptr() + line_reader.size() - line_terminator_len; parse_header(line_reader.ptr(), end, - [&](const std::string &key, const std::string &val) { - x.headers.emplace(key, val); + [&](std::string &&key, std::string &&val) { + x.headers.emplace(std::move(key), std::move(val)); }); if (!line_reader.getline()) { return false; } @@ -4388,8 +3959,8 @@ inline bool read_content_chunked(Stream &strm, T &x, } inline bool is_chunked_transfer_encoding(const Headers &headers) { - return case_ignore::equal( - get_header_value(headers, "Transfer-Encoding", "", 0), "chunked"); + return compare_case_ignore( + get_header_value(headers, "Transfer-Encoding", 0, ""), "chunked"); } template @@ -4455,14 +4026,8 @@ bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status, } else if (!has_header(x.headers, "Content-Length")) { ret = read_content_without_length(strm, out); } else { - auto is_invalid_value = false; - auto len = get_header_value_u64(x.headers, "Content-Length", - std::numeric_limits::max(), - 0, is_invalid_value); - - if (is_invalid_value) { - ret = false; - } else if (len > payload_max_length) { + auto len = get_header_value_u64(x.headers, "Content-Length", 0, 0); + if (len > payload_max_length) { exceed_payload_max_length = true; skip_content_with_length(strm, len); ret = false; @@ -4477,36 +4042,13 @@ bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status, } return ret; }); -} - -inline ssize_t write_request_line(Stream &strm, const std::string &method, - const std::string &path) { - std::string s = method; - s += " "; - s += path; - s += " HTTP/1.1\r\n"; - return strm.write(s.data(), s.size()); -} - -inline ssize_t write_response_line(Stream &strm, int status) { - std::string s = "HTTP/1.1 "; - s += std::to_string(status); - s += " "; - s += httplib::status_message(status); - s += "\r\n"; - return strm.write(s.data(), s.size()); 
-} +} // namespace detail inline ssize_t write_headers(Stream &strm, const Headers &headers) { ssize_t write_len = 0; for (const auto &x : headers) { - std::string s; - s = x.first; - s += ": "; - s += x.second; - s += "\r\n"; - - auto len = strm.write(s.data(), s.size()); + auto len = + strm.write_format("%s: %s\r\n", x.first.c_str(), x.second.c_str()); if (len < 0) { return len; } write_len += len; } @@ -4760,22 +4302,22 @@ inline std::string params_to_query_str(const Params ¶ms) { return query; } -inline void parse_query_text(const char *data, std::size_t size, - Params ¶ms) { +inline void parse_query_text(const std::string &s, Params ¶ms) { std::set cache; - split(data, data + size, '&', [&](const char *b, const char *e) { + split(s.data(), s.data() + s.size(), '&', [&](const char *b, const char *e) { std::string kv(b, e); if (cache.find(kv) != cache.end()) { return; } - cache.insert(std::move(kv)); + cache.insert(kv); std::string key; std::string val; - divide(b, static_cast(e - b), '=', - [&](const char *lhs_data, std::size_t lhs_size, const char *rhs_data, - std::size_t rhs_size) { - key.assign(lhs_data, lhs_size); - val.assign(rhs_data, rhs_size); - }); + split(b, e, '=', [&](const char *b2, const char *e2) { + if (key.empty()) { + key.assign(b2, e2); + } else { + val.assign(b2, e2); + } + }); if (!key.empty()) { params.emplace(decode_url(key, true), decode_url(val, true)); @@ -4783,10 +4325,6 @@ inline void parse_query_text(const char *data, std::size_t size, }); } -inline void parse_query_text(const std::string &s, Params ¶ms) { - parse_query_text(s.data(), s.size(), params); -} - inline bool parse_multipart_boundary(const std::string &content_type, std::string &boundary) { auto boundary_keyword = "boundary="; @@ -4827,44 +4365,35 @@ inline bool parse_range_header(const std::string &s, Ranges &ranges) { #else inline bool parse_range_header(const std::string &s, Ranges &ranges) try { #endif - auto is_valid = [](const std::string &str) { - return std::all_of(str.cbegin(), str.cend(), - [](unsigned char c) { return std::isdigit(c); }); - }; - - if (s.size() > 7 && s.compare(0, 6, "bytes=") == 0) { - const auto pos = static_cast(6); - const auto len = static_cast(s.size() - 6); + static auto re_first_range = std::regex(R"(bytes=(\d*-\d*(?:,\s*\d*-\d*)*))"); + std::smatch m; + if (std::regex_match(s, m, re_first_range)) { + auto pos = static_cast(m.position(1)); + auto len = static_cast(m.length(1)); auto all_valid_ranges = true; split(&s[pos], &s[pos + len], ',', [&](const char *b, const char *e) { if (!all_valid_ranges) { return; } + static auto re_another_range = std::regex(R"(\s*(\d*)-(\d*))"); + std::cmatch cm; + if (std::regex_match(b, e, cm, re_another_range)) { + ssize_t first = -1; + if (!cm.str(1).empty()) { + first = static_cast(std::stoll(cm.str(1))); + } - const auto it = std::find(b, e, '-'); - if (it == e) { - all_valid_ranges = false; - return; + ssize_t last = -1; + if (!cm.str(2).empty()) { + last = static_cast(std::stoll(cm.str(2))); + } + + if (first != -1 && last != -1 && first > last) { + all_valid_ranges = false; + return; + } + ranges.emplace_back(std::make_pair(first, last)); } - - const auto lhs = std::string(b, it); - const auto rhs = std::string(it + 1, e); - if (!is_valid(lhs) || !is_valid(rhs)) { - all_valid_ranges = false; - return; - } - - const auto first = - static_cast(lhs.empty() ? -1 : std::stoll(lhs)); - const auto last = - static_cast(rhs.empty() ? 
-1 : std::stoll(rhs)); - if ((first == -1 && last == -1) || - (first != -1 && last != -1 && first > last)) { - all_valid_ranges = false; - return; - } - - ranges.emplace_back(first, last); }); - return all_valid_ranges && !ranges.empty(); + return all_valid_ranges; } return false; #ifdef CPPHTTPLIB_NO_EXCEPTIONS @@ -4923,7 +4452,7 @@ public: const auto header = buf_head(pos); if (!parse_header(header.data(), header.data() + header.size(), - [&](const std::string &, const std::string &) {})) { + [&](std::string &&, std::string &&) {})) { is_valid_ = false; return false; } @@ -5033,9 +4562,7 @@ private: const std::string &b) const { if (a.size() < b.size()) { return false; } for (size_t i = 0; i < b.size(); i++) { - if (case_ignore::to_lower(a[i]) != case_ignore::to_lower(b[i])) { - return false; - } + if (::tolower(a[i]) != ::tolower(b[i])) { return false; } } return true; } @@ -5118,6 +4645,16 @@ private: size_t buf_epos_ = 0; }; +inline std::string to_lower(const char *beg, const char *end) { + std::string out; + auto it = beg; + while (it != end) { + out += static_cast(::tolower(*it)); + it++; + } + return out; +} + inline std::string random_string(size_t length) { static const char data[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; @@ -5231,18 +4768,7 @@ inline bool range_error(Request &req, Response &res) { last_pos = contant_len - 1; } - // NOTE: RFC-9110 '14.1.2. Byte Ranges': - // A client can limit the number of bytes requested without knowing the - // size of the selected representation. If the last-pos value is absent, - // or if the value is greater than or equal to the current length of the - // representation data, the byte range is interpreted as the remainder of - // the representation (i.e., the server replaces the value of last-pos - // with a value that is one less than the current length of the selected - // representation). 
- // https://www.rfc-editor.org/rfc/rfc9110.html#section-14.1.2-6 - if (last_pos == -1 || last_pos >= contant_len) { - last_pos = contant_len - 1; - } + if (last_pos == -1) { last_pos = contant_len - 1; } // Range must be within content length if (!(0 <= first_pos && first_pos <= last_pos && @@ -5269,11 +4795,12 @@ inline bool range_error(Request &req, Response &res) { inline std::pair get_range_offset_and_length(Range r, size_t content_length) { + (void)(content_length); // patch to get rid of "unused parameter" on release build assert(r.first != -1 && r.second != -1); assert(0 <= r.first && r.first < static_cast(content_length)); assert(r.first <= r.second && r.second < static_cast(content_length)); - (void)(content_length); + return std::make_pair(r.first, static_cast(r.second - r.first) + 1); } @@ -5703,7 +5230,6 @@ inline void hosted_at(const std::string &hostname, #endif return; } - auto se = detail::scope_exit([&] { freeaddrinfo(result); }); for (auto rp = result; rp; rp = rp->ai_next) { const auto &addr = @@ -5715,6 +5241,8 @@ inline void hosted_at(const std::string &hostname, addrs.push_back(ip); } } + + freeaddrinfo(result); } inline std::string append_query_params(const std::string &path, @@ -5763,8 +5291,8 @@ inline bool Request::has_header(const std::string &key) const { } inline std::string Request::get_header_value(const std::string &key, - const char *def, size_t id) const { - return detail::get_header_value(headers, key, def, id); + size_t id) const { + return detail::get_header_value(headers, key, id, ""); } inline size_t Request::get_header_value_count(const std::string &key) const { @@ -5774,8 +5302,7 @@ inline size_t Request::get_header_value_count(const std::string &key) const { inline void Request::set_header(const std::string &key, const std::string &val) { - if (detail::fields::is_field_name(key) && - detail::fields::is_field_value(val)) { + if (!detail::has_crlf(key) && !detail::has_crlf(val)) { headers.emplace(key, val); } } @@ -5829,9 +5356,8 @@ inline bool Response::has_header(const std::string &key) const { } inline std::string Response::get_header_value(const std::string &key, - const char *def, size_t id) const { - return detail::get_header_value(headers, key, def, id); + return detail::get_header_value(headers, key, id, ""); } inline size_t Response::get_header_value_count(const std::string &key) const { @@ -5841,14 +5367,13 @@ inline size_t Response::get_header_value_count(const std::string &key) const { inline void Response::set_header(const std::string &key, const std::string &val) { - if (detail::fields::is_field_name(key) && - detail::fields::is_field_value(val)) { + if (!detail::has_crlf(key) && !detail::has_crlf(val)) { headers.emplace(key, val); } } inline void Response::set_redirect(const std::string &url, int stat) { - if (detail::fields::is_field_value(url)) { + if (!detail::has_crlf(url)) { set_header("Location", url); if (300 <= stat && stat < 400) { this->status = stat; @@ -5911,25 +5436,14 @@ inline void Response::set_chunked_content_provider( is_chunked_content_provider_ = true; } -inline void Response::set_file_content(const std::string &path, - const std::string &content_type) { - file_content_path_ = path; - file_content_content_type_ = content_type; -} - -inline void Response::set_file_content(const std::string &path) { - file_content_path_ = path; -} - // Result implementation inline bool Result::has_request_header(const std::string &key) const { return request_headers_.find(key) != request_headers_.end(); } inline std::string 
Result::get_request_header_value(const std::string &key, - const char *def, size_t id) const { - return detail::get_header_value(request_headers_, key, def, id); + return detail::get_header_value(request_headers_, key, id, ""); } inline size_t @@ -6070,8 +5584,6 @@ inline socket_t BufferStream::socket() const { return 0; } inline const std::string &BufferStream::get_buffer() const { return buffer; } inline PathParamsMatcher::PathParamsMatcher(const std::string &pattern) { - static constexpr char marker[] = "/:"; - // One past the last ending position of a path param substring std::size_t last_param_end = 0; @@ -6084,14 +5596,13 @@ inline PathParamsMatcher::PathParamsMatcher(const std::string &pattern) { #endif while (true) { - const auto marker_pos = pattern.find( - marker, last_param_end == 0 ? last_param_end : last_param_end - 1); + const auto marker_pos = pattern.find(marker, last_param_end); if (marker_pos == std::string::npos) { break; } static_fragments_.push_back( - pattern.substr(last_param_end, marker_pos - last_param_end + 1)); + pattern.substr(last_param_end, marker_pos - last_param_end)); - const auto param_name_start = marker_pos + 2; + const auto param_name_start = marker_pos + 1; auto sep_pos = pattern.find(separator, param_name_start); if (sep_pos == std::string::npos) { sep_pos = pattern.length(); } @@ -6153,7 +5664,7 @@ inline bool PathParamsMatcher::match(Request &request) const { request.path_params.emplace( param_name, request.path.substr(starting_pos, sep_pos - starting_pos)); - // Mark everything up to '/' as matched + // Mark everythin up to '/' as matched starting_pos = sep_pos + 1; } // Returns false if the path is longer than the pattern @@ -6252,8 +5763,7 @@ inline bool Server::set_base_dir(const std::string &dir, inline bool Server::set_mount_point(const std::string &mount_point, const std::string &dir, Headers headers) { - detail::FileStat stat(dir); - if (stat.is_dir()) { + if (detail::is_dir(dir)) { std::string mnt = !mount_point.empty() ? 
mount_point : "/"; if (!mnt.empty() && mnt[0] == '/') { base_dirs_.push_back({mnt, dir, std::move(headers)}); @@ -6290,14 +5800,12 @@ inline Server &Server::set_file_request_handler(Handler handler) { return *this; } -inline Server &Server::set_error_handler_core(HandlerWithResponse handler, - std::true_type) { +inline Server &Server::set_error_handler(HandlerWithResponse handler) { error_handler_ = std::move(handler); return *this; } -inline Server &Server::set_error_handler_core(Handler handler, - std::false_type) { +inline Server &Server::set_error_handler(Handler handler) { error_handler_ = [handler](const Request &req, Response &res) { handler(req, res); return HandlerResponse::Handled; @@ -6341,11 +5849,6 @@ inline Server &Server::set_tcp_nodelay(bool on) { return *this; } -inline Server &Server::set_ipv6_v6only(bool on) { - ipv6_v6only_ = on; - return *this; -} - inline Server &Server::set_socket_options(SocketOptions socket_options) { socket_options_ = std::move(socket_options); return *this; @@ -6397,27 +5900,27 @@ inline Server &Server::set_payload_max_length(size_t length) { inline bool Server::bind_to_port(const std::string &host, int port, int socket_flags) { - auto ret = bind_internal(host, port, socket_flags); - if (ret == -1) { is_decommisioned = true; } - return ret >= 0; + return bind_internal(host, port, socket_flags) >= 0; } inline int Server::bind_to_any_port(const std::string &host, int socket_flags) { - auto ret = bind_internal(host, 0, socket_flags); - if (ret == -1) { is_decommisioned = true; } - return ret; + return bind_internal(host, 0, socket_flags); } -inline bool Server::listen_after_bind() { return listen_internal(); } +inline bool Server::listen_after_bind() { + auto se = detail::scope_exit([&]() { done_ = true; }); + return listen_internal(); +} inline bool Server::listen(const std::string &host, int port, int socket_flags) { + auto se = detail::scope_exit([&]() { done_ = true; }); return bind_to_port(host, port, socket_flags) && listen_internal(); } inline bool Server::is_running() const { return is_running_; } inline void Server::wait_until_ready() const { - while (!is_running_ && !is_decommisioned) { + while (!is_running() && !done_) { std::this_thread::sleep_for(std::chrono::milliseconds{1}); } } @@ -6429,11 +5932,8 @@ inline void Server::stop() { detail::shutdown_socket(sock); detail::close_socket(sock); } - is_decommisioned = false; } -inline void Server::decommission() { is_decommisioned = true; } - inline bool Server::parse_request_line(const char *s, Request &req) const { auto len = strlen(s); if (len < 2 || s[len - 2] != '\r' || s[len - 1] != '\n') { return false; } @@ -6472,13 +5972,26 @@ inline bool Server::parse_request_line(const char *s, Request &req) const { } } - detail::divide(req.target, '?', - [&](const char *lhs_data, std::size_t lhs_size, - const char *rhs_data, std::size_t rhs_size) { - req.path = detail::decode_url( - std::string(lhs_data, lhs_size), false); - detail::parse_query_text(rhs_data, rhs_size, req.params); - }); + size_t count = 0; + + detail::split(req.target.data(), req.target.data() + req.target.size(), '?', + 2, [&](const char *b, const char *e) { + switch (count) { + case 0: + req.path = detail::decode_url(std::string(b, e), false); + break; + case 1: { + if (e - b > 0) { + detail::parse_query_text(std::string(b, e), req.params); + } + break; + } + default: break; + } + count++; + }); + + if (count > 2) { return false; } } return true; @@ -6517,24 +6030,23 @@ inline bool Server::write_response_core(Stream &strm, bool 
close_connection, if (close_connection || req.get_header_value("Connection") == "close") { res.set_header("Connection", "close"); } else { - std::string s = "timeout="; - s += std::to_string(keep_alive_timeout_sec_); - s += ", max="; - s += std::to_string(keep_alive_max_count_); - res.set_header("Keep-Alive", s); + std::stringstream ss; + ss << "timeout=" << keep_alive_timeout_sec_ + << ", max=" << keep_alive_max_count_; + res.set_header("Keep-Alive", ss.str()); } - if ((!res.body.empty() || res.content_length_ > 0 || res.content_provider_) && - !res.has_header("Content-Type")) { + if (!res.has_header("Content-Type") && + (!res.body.empty() || res.content_length_ > 0 || res.content_provider_)) { res.set_header("Content-Type", "text/plain"); } - if (res.body.empty() && !res.content_length_ && !res.content_provider_ && - !res.has_header("Content-Length")) { + if (!res.has_header("Content-Length") && res.body.empty() && + !res.content_length_ && !res.content_provider_) { res.set_header("Content-Length", "0"); } - if (req.method == "HEAD" && !res.has_header("Accept-Ranges")) { + if (!res.has_header("Accept-Ranges") && req.method == "HEAD") { res.set_header("Accept-Ranges", "bytes"); } @@ -6543,7 +6055,12 @@ inline bool Server::write_response_core(Stream &strm, bool close_connection, // Response line and headers { detail::BufferStream bstrm; - if (!detail::write_response_line(bstrm, res.status)) { return false; } + + if (!bstrm.write_format("HTTP/1.1 %d %s\r\n", res.status, + status_message(res.status))) { + return false; + } + if (!header_writer_(bstrm, res.headers)) { return false; } // Flush buffer @@ -6737,14 +6254,7 @@ inline bool Server::handle_file_request(const Request &req, Response &res, auto path = entry.base_dir + sub_path; if (path.back() == '/') { path += "index.html"; } - detail::FileStat stat(path); - - if (stat.is_dir()) { - res.set_redirect(sub_path + "/", StatusCode::MovedPermanently_301); - return true; - } - - if (stat.is_file()) { + if (detail::is_file(path)) { for (const auto &kv : entry.headers) { res.set_header(kv.first, kv.second); } @@ -6779,8 +6289,8 @@ Server::create_server_socket(const std::string &host, int port, SocketOptions socket_options) const { return detail::create_socket( host, std::string(), port, address_family_, socket_flags, tcp_nodelay_, - ipv6_v6only_, std::move(socket_options), - [](socket_t sock, struct addrinfo &ai, bool & /*quit*/) -> bool { + std::move(socket_options), + [](socket_t sock, struct addrinfo &ai) -> bool { if (::bind(sock, ai.ai_addr, static_cast(ai.ai_addrlen))) { return false; } @@ -6791,8 +6301,6 @@ Server::create_server_socket(const std::string &host, int port, inline int Server::bind_internal(const std::string &host, int port, int socket_flags) { - if (is_decommisioned) { return -1; } - if (!is_valid()) { return -1; } svr_sock_ = create_server_socket(host, port, socket_flags, socket_options_); @@ -6818,8 +6326,6 @@ inline int Server::bind_internal(const std::string &host, int port, } inline bool Server::listen_internal() { - if (is_decommisioned) { return false; } - auto ret = true; is_running_ = true; auto se = detail::scope_exit([&]() { is_running_ = false; }); @@ -6840,22 +6346,13 @@ inline bool Server::listen_internal() { #ifndef _WIN32 } #endif - -#if defined _WIN32 - // sockets conneced via WASAccept inherit flags NO_HANDLE_INHERIT, - // OVERLAPPED - socket_t sock = WSAAccept(svr_sock_, nullptr, nullptr, nullptr, 0); -#elif defined SOCK_CLOEXEC - socket_t sock = accept4(svr_sock_, nullptr, nullptr, SOCK_CLOEXEC); -#else 
socket_t sock = accept(svr_sock_, nullptr, nullptr); -#endif if (sock == INVALID_SOCKET) { if (errno == EMFILE) { // The per-process limit of open file descriptors has been reached. // Try to accept new connections after a short sleep. - std::this_thread::sleep_for(std::chrono::microseconds{1}); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); continue; } else if (errno == EINTR || errno == EAGAIN) { continue; @@ -6909,7 +6406,6 @@ inline bool Server::listen_internal() { task_queue->shutdown(); } - is_decommisioned = !ret; return ret; } @@ -7007,7 +6503,7 @@ inline bool Server::dispatch_request(Request &req, Response &res, inline void Server::apply_ranges(const Request &req, Response &res, std::string &content_type, std::string &boundary) const { - if (req.ranges.size() > 1 && res.status == StatusCode::PartialContent_206) { + if (req.ranges.size() > 1) { auto it = res.headers.find("Content-Type"); if (it != res.headers.end()) { content_type = it->second; @@ -7025,7 +6521,7 @@ inline void Server::apply_ranges(const Request &req, Response &res, if (res.body.empty()) { if (res.content_length_ > 0) { size_t length = 0; - if (req.ranges.empty() || res.status != StatusCode::PartialContent_206) { + if (req.ranges.empty()) { length = res.content_length_; } else if (req.ranges.size() == 1) { auto offset_and_length = detail::get_range_offset_and_length( @@ -7054,7 +6550,7 @@ inline void Server::apply_ranges(const Request &req, Response &res, } } } else { - if (req.ranges.empty() || res.status != StatusCode::PartialContent_206) { + if (req.ranges.empty()) { ; } else if (req.ranges.size() == 1) { auto offset_and_length = @@ -7125,9 +6621,7 @@ inline bool Server::dispatch_request_for_content_reader( } inline bool -Server::process_request(Stream &strm, const std::string &remote_addr, - int remote_port, const std::string &local_addr, - int local_port, bool close_connection, +Server::process_request(Stream &strm, bool close_connection, bool &connection_closed, const std::function &setup_request) { std::array buf{}; @@ -7181,13 +6675,11 @@ Server::process_request(Stream &strm, const std::string &remote_addr, connection_closed = true; } - req.remote_addr = remote_addr; - req.remote_port = remote_port; + strm.get_remote_ip_and_port(req.remote_addr, req.remote_port); req.set_header("REMOTE_ADDR", req.remote_addr); req.set_header("REMOTE_PORT", std::to_string(req.remote_port)); - req.local_addr = local_addr; - req.local_port = local_port; + strm.get_local_ip_and_port(req.local_addr, req.local_port); req.set_header("LOCAL_ADDR", req.local_addr); req.set_header("LOCAL_PORT", std::to_string(req.local_port)); @@ -7209,20 +6701,13 @@ Server::process_request(Stream &strm, const std::string &remote_addr, switch (status) { case StatusCode::Continue_100: case StatusCode::ExpectationFailed_417: - detail::write_response_line(strm, status); - strm.write("\r\n"); + strm.write_format("HTTP/1.1 %d %s\r\n\r\n", status, + status_message(status)); break; - default: - connection_closed = true; - return write_response(strm, true, req, res); + default: return write_response(strm, close_connection, req, res); } } - // Setup `is_connection_closed` method - req.is_connection_closed = [&]() { - return !detail::is_socket_alive(strm.socket()); - }; - // Routing auto routed = false; #ifdef CPPHTTPLIB_NO_EXCEPTIONS @@ -7265,32 +6750,6 @@ Server::process_request(Stream &strm, const std::string &remote_addr, : StatusCode::PartialContent_206; } - // Serve file content by using a content provider - if 
(!res.file_content_path_.empty()) { - const auto &path = res.file_content_path_; - auto mm = std::make_shared(path.c_str()); - if (!mm->is_open()) { - res.body.clear(); - res.content_length_ = 0; - res.content_provider_ = nullptr; - res.status = StatusCode::NotFound_404; - return write_response(strm, close_connection, req, res); - } - - auto content_type = res.file_content_content_type_; - if (content_type.empty()) { - content_type = detail::find_content_type( - path, file_extension_and_mimetype_map_, default_file_mimetype_); - } - - res.set_content_provider( - mm->size(), content_type, - [mm](size_t offset, size_t length, DataSink &sink) -> bool { - sink.write(mm->data() + offset, length); - return true; - }); - } - if (detail::range_error(req, res)) { res.body.clear(); res.content_length_ = 0; @@ -7310,21 +6769,12 @@ Server::process_request(Stream &strm, const std::string &remote_addr, inline bool Server::is_valid() const { return true; } inline bool Server::process_and_close_socket(socket_t sock) { - std::string remote_addr; - int remote_port = 0; - detail::get_remote_ip_and_port(sock, remote_addr, remote_port); - - std::string local_addr; - int local_port = 0; - detail::get_local_ip_and_port(sock, local_addr, local_port); - auto ret = detail::process_server_socket( svr_sock_, sock, keep_alive_max_count_, keep_alive_timeout_sec_, read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, - [&](Stream &strm, bool close_connection, bool &connection_closed) { - return process_request(strm, remote_addr, remote_port, local_addr, - local_port, close_connection, connection_closed, + [this](Stream &strm, bool close_connection, bool &connection_closed) { + return process_request(strm, close_connection, connection_closed, nullptr); }); @@ -7343,8 +6793,8 @@ inline ClientImpl::ClientImpl(const std::string &host, int port) inline ClientImpl::ClientImpl(const std::string &host, int port, const std::string &client_cert_path, const std::string &client_key_path) - : host_(detail::escape_abstract_namespace_unix_domain(host)), port_(port), - host_and_port_(adjust_host_string(host_) + ":" + std::to_string(port)), + : host_(host), port_(port), + host_and_port_(adjust_host_string(host) + ":" + std::to_string(port)), client_cert_path_(client_cert_path), client_key_path_(client_key_path) {} inline ClientImpl::~ClientImpl() { @@ -7375,7 +6825,6 @@ inline void ClientImpl::copy_settings(const ClientImpl &rhs) { url_encode_ = rhs.url_encode_; address_family_ = rhs.address_family_; tcp_nodelay_ = rhs.tcp_nodelay_; - ipv6_v6only_ = rhs.ipv6_v6only_; socket_options_ = rhs.socket_options_; compress_ = rhs.compress_; decompress_ = rhs.decompress_; @@ -7396,8 +6845,6 @@ inline void ClientImpl::copy_settings(const ClientImpl &rhs) { #endif #ifdef CPPHTTPLIB_OPENSSL_SUPPORT server_certificate_verification_ = rhs.server_certificate_verification_; - server_hostname_verification_ = rhs.server_hostname_verification_; - server_certificate_verifier_ = rhs.server_certificate_verifier_; #endif logger_ = rhs.logger_; } @@ -7406,9 +6853,9 @@ inline socket_t ClientImpl::create_client_socket(Error &error) const { if (!proxy_host_.empty() && proxy_port_ != -1) { return detail::create_client_socket( proxy_host_, std::string(), proxy_port_, address_family_, tcp_nodelay_, - ipv6_v6only_, socket_options_, connection_timeout_sec_, - connection_timeout_usec_, read_timeout_sec_, read_timeout_usec_, - write_timeout_sec_, write_timeout_usec_, interface_, error); + socket_options_, connection_timeout_sec_, 
connection_timeout_usec_, + read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, + write_timeout_usec_, interface_, error); } // Check is custom IP specified for host_ @@ -7417,10 +6864,10 @@ inline socket_t ClientImpl::create_client_socket(Error &error) const { if (it != addr_map_.end()) { ip = it->second; } return detail::create_client_socket( - host_, ip, port_, address_family_, tcp_nodelay_, ipv6_v6only_, - socket_options_, connection_timeout_sec_, connection_timeout_usec_, - read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, - write_timeout_usec_, interface_, error); + host_, ip, port_, address_family_, tcp_nodelay_, socket_options_, + connection_timeout_sec_, connection_timeout_usec_, read_timeout_sec_, + read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, interface_, + error); } inline bool ClientImpl::create_and_connect_socket(Socket &socket, @@ -7509,18 +6956,6 @@ inline bool ClientImpl::send(Request &req, Response &res, Error &error) { return ret; } -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline bool ClientImpl::is_ssl_peer_could_be_closed(SSL *ssl) const { - detail::set_nonblocking(socket_.sock, true); - auto se = detail::scope_exit( - [&]() { detail::set_nonblocking(socket_.sock, false); }); - - char buf[1]; - return !SSL_peek(ssl, buf, 1) && - SSL_get_error(ssl, 0) == SSL_ERROR_ZERO_RETURN; -} -#endif - inline bool ClientImpl::send_(Request &req, Response &res, Error &error) { { std::lock_guard guard(socket_mutex_); @@ -7532,13 +6967,6 @@ inline bool ClientImpl::send_(Request &req, Response &res, Error &error) { auto is_alive = false; if (socket_.is_open()) { is_alive = detail::is_socket_alive(socket_.sock); - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (is_alive && is_ssl()) { - if (is_ssl_peer_could_be_closed(socket_.ssl)) { is_alive = false; } - } -#endif - if (!is_alive) { // Attempt to avoid sigpipe by shutting down nongracefully if it seems // like the other side has already closed the connection Also, there @@ -7716,7 +7144,7 @@ inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) { if (location.empty()) { return false; } const static std::regex re( - R"((?:(https?):)?(?://(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)?([^?#]*)(\?[^#]*)?(?:#.*)?)"); + R"((?:(https?):)?(?://(?:\[([\d:]+)\]|([^:/?#]+))(?::(\d+))?)?([^?#]*)(\?[^#]*)?(?:#.*)?)"); std::smatch m; if (!std::regex_match(location, m, re)) { return false; } @@ -7815,26 +7243,12 @@ inline bool ClientImpl::write_request(Stream &strm, Request &req, if (!req.has_header("Accept")) { req.set_header("Accept", "*/*"); } - if (!req.content_receiver) { - if (!req.has_header("Accept-Encoding")) { - std::string accept_encoding; -#ifdef CPPHTTPLIB_BROTLI_SUPPORT - accept_encoding = "br"; -#endif -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - if (!accept_encoding.empty()) { accept_encoding += ", "; } - accept_encoding += "gzip, deflate"; -#endif - req.set_header("Accept-Encoding", accept_encoding); - } - #ifndef CPPHTTPLIB_NO_DEFAULT_USER_AGENT - if (!req.has_header("User-Agent")) { - auto agent = std::string("cpp-httplib/") + CPPHTTPLIB_VERSION; - req.set_header("User-Agent", agent); - } + if (!req.has_header("User-Agent")) { + auto agent = std::string("cpp-httplib/") + CPPHTTPLIB_VERSION; + req.set_header("User-Agent", agent); + } #endif - }; if (req.body.empty()) { if (req.content_provider_) { @@ -7894,14 +7308,8 @@ inline bool ClientImpl::write_request(Stream &strm, Request &req, { detail::BufferStream bstrm; - const auto &path_with_query = - req.params.empty() ? 
req.path - : append_query_params(req.path, req.params); - - const auto &path = - url_encode_ ? detail::encode_url(path_with_query) : path_with_query; - - detail::write_request_line(bstrm, req.method, path); + const auto &path = url_encode_ ? detail::encode_url(req.path) : req.path; + bstrm.write_format("%s %s HTTP/1.1\r\n", req.method.c_str(), path.c_str()); header_writer_(bstrm, req.headers); @@ -8009,12 +7417,11 @@ inline Result ClientImpl::send_with_content_provider( const std::string &method, const std::string &path, const Headers &headers, const char *body, size_t content_length, ContentProvider content_provider, ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type, Progress progress) { + const std::string &content_type) { Request req; req.method = method; req.headers = headers; req.path = path; - req.progress = progress; auto error = Error::Success; @@ -8041,7 +7448,9 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req, if (is_ssl()) { auto is_proxy_enabled = !proxy_host_.empty() && proxy_port_ != -1; if (!is_proxy_enabled) { - if (is_ssl_peer_could_be_closed(socket_.ssl)) { + char buf[1]; + if (SSL_peek(socket_.ssl, buf, 1) == 0 && + SSL_get_error(socket_.ssl, 0) == SSL_ERROR_ZERO_RETURN) { error = Error::SSLPeerCouldBeClosed_; return false; } @@ -8059,9 +7468,7 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req, // Body if ((res.status != StatusCode::NoContent_204) && req.method != "HEAD" && req.method != "CONNECT") { - auto redirect = 300 < res.status && res.status < 400 && - res.status != StatusCode::NotModified_304 && - follow_location_; + auto redirect = 300 < res.status && res.status < 400 && follow_location_; if (req.response_handler && !redirect) { if (!req.response_handler(res)) { @@ -8082,7 +7489,9 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req, : static_cast( [&](const char *buf, size_t n, uint64_t /*off*/, uint64_t /*len*/) { - assert(res.body.size() + n <= res.body.max_size()); + if (res.body.size() + n > res.body.max_size()) { + return false; + } res.body.append(buf, n); return true; }); @@ -8094,25 +7503,12 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req, return ret; }; - if (res.has_header("Content-Length")) { - if (!req.content_receiver) { - auto len = res.get_header_value_u64("Content-Length"); - if (len > res.body.max_size()) { - error = Error::Read; - return false; - } - res.body.reserve(static_cast(len)); - } - } - - if (res.status != StatusCode::NotModified_304) { - int dummy_status; - if (!detail::read_content(strm, res, (std::numeric_limits::max)(), - dummy_status, std::move(progress), - std::move(out), decompress_)) { - if (error != Error::Canceled) { error = Error::Read; } - return false; - } + int dummy_status; + if (!detail::read_content(strm, res, (std::numeric_limits::max)(), + dummy_status, std::move(progress), std::move(out), + decompress_)) { + if (error != Error::Canceled) { error = Error::Read; } + return false; } } @@ -8321,22 +7717,14 @@ inline Result ClientImpl::Post(const std::string &path, inline Result ClientImpl::Post(const std::string &path, const char *body, size_t content_length, const std::string &content_type) { - return Post(path, Headers(), body, content_length, content_type, nullptr); + return Post(path, Headers(), body, content_length, content_type); } inline Result ClientImpl::Post(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { 
return send_with_content_provider("POST", path, headers, body, content_length, - nullptr, nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("POST", path, headers, body, content_length, - nullptr, nullptr, content_type, progress); + nullptr, nullptr, content_type); } inline Result ClientImpl::Post(const std::string &path, const std::string &body, @@ -8344,27 +7732,12 @@ inline Result ClientImpl::Post(const std::string &path, const std::string &body, return Post(path, Headers(), body, content_type); } -inline Result ClientImpl::Post(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return Post(path, Headers(), body, content_type, progress); -} - inline Result ClientImpl::Post(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return send_with_content_provider("POST", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("POST", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - progress); + body.size(), nullptr, nullptr, + content_type); } inline Result ClientImpl::Post(const std::string &path, const Params ¶ms) { @@ -8390,15 +7763,14 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers, const std::string &content_type) { return send_with_content_provider("POST", path, headers, nullptr, content_length, std::move(content_provider), - nullptr, content_type, nullptr); + nullptr, content_type); } inline Result ClientImpl::Post(const std::string &path, const Headers &headers, ContentProviderWithoutLength content_provider, const std::string &content_type) { return send_with_content_provider("POST", path, headers, nullptr, 0, nullptr, - std::move(content_provider), content_type, - nullptr); + std::move(content_provider), content_type); } inline Result ClientImpl::Post(const std::string &path, const Headers &headers, @@ -8407,13 +7779,6 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers, return Post(path, headers, query, "application/x-www-form-urlencoded"); } -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - auto query = detail::params_to_query_str(params); - return Post(path, headers, query, "application/x-www-form-urlencoded", - progress); -} - inline Result ClientImpl::Post(const std::string &path, const MultipartFormDataItems &items) { return Post(path, Headers(), items); @@ -8451,7 +7816,7 @@ ClientImpl::Post(const std::string &path, const Headers &headers, return send_with_content_provider( "POST", path, headers, nullptr, 0, nullptr, get_multipart_content_provider(boundary, items, provider_items), - content_type, nullptr); + content_type); } inline Result ClientImpl::Put(const std::string &path) { @@ -8468,15 +7833,7 @@ inline Result ClientImpl::Put(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { return send_with_content_provider("PUT", path, headers, body, content_length, - nullptr, nullptr, content_type, 
nullptr); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("PUT", path, headers, body, content_length, - nullptr, nullptr, content_type, progress); + nullptr, nullptr, content_type); } inline Result ClientImpl::Put(const std::string &path, const std::string &body, @@ -8484,27 +7841,12 @@ inline Result ClientImpl::Put(const std::string &path, const std::string &body, return Put(path, Headers(), body, content_type); } -inline Result ClientImpl::Put(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return Put(path, Headers(), body, content_type, progress); -} - inline Result ClientImpl::Put(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return send_with_content_provider("PUT", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - nullptr); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("PUT", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - progress); + body.size(), nullptr, nullptr, + content_type); } inline Result ClientImpl::Put(const std::string &path, size_t content_length, @@ -8526,15 +7868,14 @@ inline Result ClientImpl::Put(const std::string &path, const Headers &headers, const std::string &content_type) { return send_with_content_provider("PUT", path, headers, nullptr, content_length, std::move(content_provider), - nullptr, content_type, nullptr); + nullptr, content_type); } inline Result ClientImpl::Put(const std::string &path, const Headers &headers, ContentProviderWithoutLength content_provider, const std::string &content_type) { return send_with_content_provider("PUT", path, headers, nullptr, 0, nullptr, - std::move(content_provider), content_type, - nullptr); + std::move(content_provider), content_type); } inline Result ClientImpl::Put(const std::string &path, const Params ¶ms) { @@ -8547,13 +7888,6 @@ inline Result ClientImpl::Put(const std::string &path, const Headers &headers, return Put(path, headers, query, "application/x-www-form-urlencoded"); } -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - auto query = detail::params_to_query_str(params); - return Put(path, headers, query, "application/x-www-form-urlencoded", - progress); -} - inline Result ClientImpl::Put(const std::string &path, const MultipartFormDataItems &items) { return Put(path, Headers(), items); @@ -8591,7 +7925,7 @@ ClientImpl::Put(const std::string &path, const Headers &headers, return send_with_content_provider( "PUT", path, headers, nullptr, 0, nullptr, get_multipart_content_provider(boundary, items, provider_items), - content_type, nullptr); + content_type); } inline Result ClientImpl::Patch(const std::string &path) { return Patch(path, std::string(), std::string()); @@ -8603,26 +7937,12 @@ inline Result ClientImpl::Patch(const std::string &path, const char *body, return Patch(path, Headers(), body, content_length, content_type); } -inline Result ClientImpl::Patch(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return Patch(path, Headers(), body, 
content_length, content_type, progress); -} - inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { - return Patch(path, headers, body, content_length, content_type, nullptr); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { return send_with_content_provider("PATCH", path, headers, body, content_length, nullptr, nullptr, - content_type, progress); + content_type); } inline Result ClientImpl::Patch(const std::string &path, @@ -8631,26 +7951,12 @@ inline Result ClientImpl::Patch(const std::string &path, return Patch(path, Headers(), body, content_type); } -inline Result ClientImpl::Patch(const std::string &path, - const std::string &body, - const std::string &content_type, - Progress progress) { - return Patch(path, Headers(), body, content_type, progress); -} - inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { - return Patch(path, headers, body, content_type, nullptr); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { return send_with_content_provider("PATCH", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - progress); + body.size(), nullptr, nullptr, + content_type); } inline Result ClientImpl::Patch(const std::string &path, size_t content_length, @@ -8672,15 +7978,14 @@ inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, const std::string &content_type) { return send_with_content_provider("PATCH", path, headers, nullptr, content_length, std::move(content_provider), - nullptr, content_type, nullptr); + nullptr, content_type); } inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, ContentProviderWithoutLength content_provider, const std::string &content_type) { return send_with_content_provider("PATCH", path, headers, nullptr, 0, nullptr, - std::move(content_provider), content_type, - nullptr); + std::move(content_provider), content_type); } inline Result ClientImpl::Delete(const std::string &path) { @@ -8698,30 +8003,14 @@ inline Result ClientImpl::Delete(const std::string &path, const char *body, return Delete(path, Headers(), body, content_length, content_type); } -inline Result ClientImpl::Delete(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return Delete(path, Headers(), body, content_length, content_type, progress); -} - inline Result ClientImpl::Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { - return Delete(path, headers, body, content_length, content_type, nullptr); -} - -inline Result ClientImpl::Delete(const std::string &path, - const Headers &headers, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { Request req; req.method = "DELETE"; req.headers = headers; req.path = path; - req.progress = progress; if (!content_type.empty()) { req.set_header("Content-Type", content_type); } req.body.assign(body, content_length); @@ -8735,14 +8024,6 @@ inline Result ClientImpl::Delete(const std::string &path, return Delete(path, Headers(), 
body.data(), body.size(), content_type); } -inline Result ClientImpl::Delete(const std::string &path, - const std::string &body, - const std::string &content_type, - Progress progress) { - return Delete(path, Headers(), body.data(), body.size(), content_type, - progress); -} - inline Result ClientImpl::Delete(const std::string &path, const Headers &headers, const std::string &body, @@ -8750,15 +8031,6 @@ inline Result ClientImpl::Delete(const std::string &path, return Delete(path, headers, body.data(), body.size(), content_type); } -inline Result ClientImpl::Delete(const std::string &path, - const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return Delete(path, headers, body.data(), body.size(), content_type, - progress); -} - inline Result ClientImpl::Options(const std::string &path) { return Options(path, Headers()); } @@ -8866,8 +8138,6 @@ inline void ClientImpl::set_address_family(int family) { inline void ClientImpl::set_tcp_nodelay(bool on) { tcp_nodelay_ = on; } -inline void ClientImpl::set_ipv6_v6only(bool on) { ipv6_v6only_ = on; } - inline void ClientImpl::set_socket_options(SocketOptions socket_options) { socket_options_ = std::move(socket_options); } @@ -8917,11 +8187,13 @@ inline void ClientImpl::set_ca_cert_store(X509_STORE *ca_cert_store) { inline X509_STORE *ClientImpl::create_ca_cert_store(const char *ca_cert, std::size_t size) const { auto mem = BIO_new_mem_buf(ca_cert, static_cast<int>(size)); - auto se = detail::scope_exit([&] { BIO_free_all(mem); }); if (!mem) { return nullptr; } auto inf = PEM_X509_INFO_read_bio(mem, nullptr, nullptr, nullptr); - if (!inf) { return nullptr; } + if (!inf) { + BIO_free_all(mem); + return nullptr; + } auto cts = X509_STORE_new(); if (cts) { @@ -8935,21 +8207,13 @@ inline X509_STORE *ClientImpl::create_ca_cert_store(const char *ca_cert, } sk_X509_INFO_pop_free(inf, X509_INFO_free); + BIO_free_all(mem); return cts; } inline void ClientImpl::enable_server_certificate_verification(bool enabled) { server_certificate_verification_ = enabled; } - -inline void ClientImpl::enable_server_hostname_verification(bool enabled) { - server_hostname_verification_ = enabled; -} - -inline void ClientImpl::set_server_certificate_verifier( - std::function<bool(SSL *ssl)> verifier) { - server_certificate_verifier_ = verifier; -} #endif inline void ClientImpl::set_logger(Logger logger) { @@ -8993,30 +8257,13 @@ inline SSL *ssl_new(socket_t sock, SSL_CTX *ctx, std::mutex &ctx_mutex, return ssl; } -inline void ssl_delete(std::mutex &ctx_mutex, SSL *ssl, socket_t sock, +inline void ssl_delete(std::mutex &ctx_mutex, SSL *ssl, bool shutdown_gracefully) { // sometimes we may want to skip this to try to avoid SIGPIPE if we know // the remote has closed the network connection // Note that it is not always possible to avoid SIGPIPE, this is merely a // best-efforts.
- if (shutdown_gracefully) { -#ifdef _WIN32 - (void)(sock); - SSL_shutdown(ssl); -#else - timeval tv; - tv.tv_sec = 1; - tv.tv_usec = 0; - setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - reinterpret_cast(&tv), sizeof(tv)); - - auto ret = SSL_shutdown(ssl); - while (ret == 0) { - std::this_thread::sleep_for(std::chrono::milliseconds{100}); - ret = SSL_shutdown(ssl); - } -#endif - } + if (shutdown_gracefully) { SSL_shutdown(ssl); } std::lock_guard guard(ctx_mutex); SSL_free(ssl); @@ -9119,7 +8366,7 @@ inline ssize_t SSLSocketStream::read(char *ptr, size_t size) { if (SSL_pending(ssl_) > 0) { return SSL_read(ssl_, ptr, static_cast(size)); } else if (is_readable()) { - std::this_thread::sleep_for(std::chrono::microseconds{10}); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); ret = SSL_read(ssl_, ptr, static_cast(size)); if (ret >= 0) { return ret; } err = SSL_get_error(ssl_, ret); @@ -9150,7 +8397,7 @@ inline ssize_t SSLSocketStream::write(const char *ptr, size_t size) { while (--n >= 0 && err == SSL_ERROR_WANT_WRITE) { #endif if (is_writable()) { - std::this_thread::sleep_for(std::chrono::microseconds{10}); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); ret = SSL_write(ssl_, ptr, static_cast(handle_size)); if (ret >= 0) { return ret; } err = SSL_get_error(ssl_, ret); @@ -9192,7 +8439,7 @@ inline SSLServer::SSLServer(const char *cert_path, const char *private_key_path, SSL_OP_NO_COMPRESSION | SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION); - SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION); + SSL_CTX_set_min_proto_version(ctx_, TLS1_1_VERSION); if (private_key_password != nullptr && (private_key_password[0] != '\0')) { SSL_CTX_set_default_passwd_cb_userdata( @@ -9202,8 +8449,7 @@ inline SSLServer::SSLServer(const char *cert_path, const char *private_key_path, if (SSL_CTX_use_certificate_chain_file(ctx_, cert_path) != 1 || SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) != - 1 || - SSL_CTX_check_private_key(ctx_) != 1) { + 1) { SSL_CTX_free(ctx_); ctx_ = nullptr; } else if (client_ca_cert_file_path || client_ca_cert_dir_path) { @@ -9225,7 +8471,7 @@ inline SSLServer::SSLServer(X509 *cert, EVP_PKEY *private_key, SSL_OP_NO_COMPRESSION | SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION); - SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION); + SSL_CTX_set_min_proto_version(ctx_, TLS1_1_VERSION); if (SSL_CTX_use_certificate(ctx_, cert) != 1 || SSL_CTX_use_PrivateKey(ctx_, private_key) != 1) { @@ -9259,19 +8505,6 @@ inline bool SSLServer::is_valid() const { return ctx_; } inline SSL_CTX *SSLServer::ssl_context() const { return ctx_; } -inline void SSLServer::update_certs(X509 *cert, EVP_PKEY *private_key, - X509_STORE *client_ca_cert_store) { - - std::lock_guard guard(ctx_mutex_); - - SSL_CTX_use_certificate(ctx_, cert); - SSL_CTX_use_PrivateKey(ctx_, private_key); - - if (client_ca_cert_store != nullptr) { - SSL_CTX_set_cert_store(ctx_, client_ca_cert_store); - } -} - inline bool SSLServer::process_and_close_socket(socket_t sock) { auto ssl = detail::ssl_new( sock, ctx_, ctx_mutex_, @@ -9283,29 +8516,20 @@ inline bool SSLServer::process_and_close_socket(socket_t sock) { auto ret = false; if (ssl) { - std::string remote_addr; - int remote_port = 0; - detail::get_remote_ip_and_port(sock, remote_addr, remote_port); - - std::string local_addr; - int local_port = 0; - detail::get_local_ip_and_port(sock, local_addr, local_port); - ret = detail::process_server_socket_ssl( svr_sock_, ssl, sock, keep_alive_max_count_, keep_alive_timeout_sec_, read_timeout_sec_, 
read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, - [&](Stream &strm, bool close_connection, bool &connection_closed) { - return process_request(strm, remote_addr, remote_port, local_addr, - local_port, close_connection, - connection_closed, + [this, ssl](Stream &strm, bool close_connection, + bool &connection_closed) { + return process_request(strm, close_connection, connection_closed, [&](Request &req) { req.ssl = ssl; }); }); // Shutdown gracefully if the result seemed successful, non-gracefully if // the connection appeared to be closed. const bool shutdown_gracefully = ret; - detail::ssl_delete(ctx_mutex_, ssl, sock, shutdown_gracefully); + detail::ssl_delete(ctx_mutex_, ssl, shutdown_gracefully); } detail::shutdown_socket(sock); @@ -9327,8 +8551,6 @@ inline SSLClient::SSLClient(const std::string &host, int port, : ClientImpl(host, port, client_cert_path, client_key_path) { ctx_ = SSL_CTX_new(TLS_client_method()); - SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION); - detail::split(&host_[0], &host_[host_.size()], '.', [&](const char *b, const char *e) { host_components_.emplace_back(b, e); @@ -9536,47 +8758,36 @@ inline bool SSLClient::initialize_ssl(Socket &socket, Error &error) { } if (server_certificate_verification_) { - if (server_certificate_verifier_) { - if (!server_certificate_verifier_(ssl2)) { - error = Error::SSLServerVerification; - return false; - } - } else { - verify_result_ = SSL_get_verify_result(ssl2); + verify_result_ = SSL_get_verify_result(ssl2); - if (verify_result_ != X509_V_OK) { - error = Error::SSLServerVerification; - return false; - } - - auto server_cert = SSL_get1_peer_certificate(ssl2); - auto se = detail::scope_exit([&] { X509_free(server_cert); }); - - if (server_cert == nullptr) { - error = Error::SSLServerVerification; - return false; - } - - if (server_hostname_verification_) { - if (!verify_host(server_cert)) { - error = Error::SSLServerHostnameVerification; - return false; - } - } + if (verify_result_ != X509_V_OK) { + error = Error::SSLServerVerification; + return false; } + + auto server_cert = SSL_get1_peer_certificate(ssl2); + + if (server_cert == nullptr) { + error = Error::SSLServerVerification; + return false; + } + + if (!verify_host(server_cert)) { + X509_free(server_cert); + error = Error::SSLServerVerification; + return false; + } + X509_free(server_cert); } return true; }, [&](SSL *ssl2) { -#if defined(OPENSSL_IS_BORINGSSL) - SSL_set_tlsext_host_name(ssl2, host_.c_str()); -#else // NOTE: Direct call instead of using the OpenSSL macro to suppress // -Wold-style-cast warning + // SSL_set_tlsext_host_name(ssl2, host_.c_str()); SSL_ctrl(ssl2, SSL_CTRL_SET_TLSEXT_HOSTNAME, TLSEXT_NAMETYPE_host_name, static_cast(const_cast(host_.c_str()))); -#endif return true; }); @@ -9601,8 +8812,7 @@ inline void SSLClient::shutdown_ssl_impl(Socket &socket, return; } if (socket.ssl) { - detail::ssl_delete(ctx_mutex_, socket.ssl, socket.sock, - shutdown_gracefully); + detail::ssl_delete(ctx_mutex_, socket.ssl, shutdown_gracefully); socket.ssl = nullptr; } assert(socket.ssl == nullptr); @@ -9651,8 +8861,8 @@ SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const { auto type = GEN_DNS; - struct in6_addr addr6{}; - struct in_addr addr{}; + struct in6_addr addr6 {}; + struct in_addr addr {}; size_t addr_len = 0; #ifndef __MINGW32__ @@ -9755,7 +8965,7 @@ inline Client::Client(const std::string &scheme_host_port, const std::string &client_cert_path, const std::string &client_key_path) { const static std::regex re( - 
R"((?:([a-z]+):\/\/)?(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)"); + R"((?:([a-z]+):\/\/)?(?:\[([\d:]+)\]|([^:/?#]+))(?::(\d+))?)"); std::smatch m; if (std::regex_match(scheme_host_port, m, re)) { @@ -9792,12 +9002,10 @@ inline Client::Client(const std::string &scheme_host_port, client_key_path); } } else { - // NOTE: Update TEST(UniversalClientImplTest, Ipv6LiteralAddress) - // if port param below changes. cli_ = detail::make_unique(scheme_host_port, 80, client_cert_path, client_key_path); } -} // namespace detail +} inline Client::Client(const std::string &host, int port) : cli_(detail::make_unique(host, port)) {} @@ -9903,30 +9111,15 @@ inline Result Client::Post(const std::string &path, const Headers &headers, const std::string &content_type) { return cli_->Post(path, headers, body, content_length, content_type); } -inline Result Client::Post(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress) { - return cli_->Post(path, headers, body, content_length, content_type, - progress); -} inline Result Client::Post(const std::string &path, const std::string &body, const std::string &content_type) { return cli_->Post(path, body, content_type); } -inline Result Client::Post(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Post(path, body, content_type, progress); -} inline Result Client::Post(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return cli_->Post(path, headers, body, content_type); } -inline Result Client::Post(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Post(path, headers, body, content_type, progress); -} inline Result Client::Post(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type) { @@ -9957,10 +9150,6 @@ inline Result Client::Post(const std::string &path, const Headers &headers, const Params ¶ms) { return cli_->Post(path, headers, params); } -inline Result Client::Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - return cli_->Post(path, headers, params, progress); -} inline Result Client::Post(const std::string &path, const MultipartFormDataItems &items) { return cli_->Post(path, items); @@ -9991,29 +9180,15 @@ inline Result Client::Put(const std::string &path, const Headers &headers, const std::string &content_type) { return cli_->Put(path, headers, body, content_length, content_type); } -inline Result Client::Put(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress) { - return cli_->Put(path, headers, body, content_length, content_type, progress); -} inline Result Client::Put(const std::string &path, const std::string &body, const std::string &content_type) { return cli_->Put(path, body, content_type); } -inline Result Client::Put(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Put(path, body, content_type, progress); -} inline Result Client::Put(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return cli_->Put(path, headers, body, content_type); } -inline Result Client::Put(const std::string &path, const Headers &headers, 
- const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Put(path, headers, body, content_type, progress); -} inline Result Client::Put(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type) { @@ -10044,10 +9219,6 @@ inline Result Client::Put(const std::string &path, const Headers &headers, const Params ¶ms) { return cli_->Put(path, headers, params); } -inline Result Client::Put(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - return cli_->Put(path, headers, params, progress); -} inline Result Client::Put(const std::string &path, const MultipartFormDataItems &items) { return cli_->Put(path, items); @@ -10075,44 +9246,20 @@ inline Result Client::Patch(const std::string &path, const char *body, const std::string &content_type) { return cli_->Patch(path, body, content_length, content_type); } -inline Result Client::Patch(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, body, content_length, content_type, progress); -} inline Result Client::Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { return cli_->Patch(path, headers, body, content_length, content_type); } -inline Result Client::Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, headers, body, content_length, content_type, - progress); -} inline Result Client::Patch(const std::string &path, const std::string &body, const std::string &content_type) { return cli_->Patch(path, body, content_type); } -inline Result Client::Patch(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, body, content_type, progress); -} inline Result Client::Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return cli_->Patch(path, headers, body, content_type); } -inline Result Client::Patch(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, headers, body, content_type, progress); -} inline Result Client::Patch(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type) { @@ -10147,44 +9294,20 @@ inline Result Client::Delete(const std::string &path, const char *body, const std::string &content_type) { return cli_->Delete(path, body, content_length, content_type); } -inline Result Client::Delete(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, body, content_length, content_type, progress); -} inline Result Client::Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { return cli_->Delete(path, headers, body, content_length, content_type); } -inline Result Client::Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, headers, body, content_length, content_type, - progress); -} inline Result 
Client::Delete(const std::string &path, const std::string &body, const std::string &content_type) { return cli_->Delete(path, body, content_type); } -inline Result Client::Delete(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, body, content_type, progress); -} inline Result Client::Delete(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return cli_->Delete(path, headers, body, content_type); } -inline Result Client::Delete(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, headers, body, content_type, progress); -} inline Result Client::Options(const std::string &path) { return cli_->Options(path); } @@ -10294,15 +9417,6 @@ inline void Client::set_proxy_digest_auth(const std::string &username, inline void Client::enable_server_certificate_verification(bool enabled) { cli_->enable_server_certificate_verification(enabled); } - -inline void Client::enable_server_hostname_verification(bool enabled) { - cli_->enable_server_hostname_verification(enabled); -} - -inline void Client::set_server_certificate_verifier( - std::function verifier) { - cli_->set_server_certificate_verifier(verifier); -} #endif inline void Client::set_logger(Logger logger) { diff --git a/examples/server/public_legacy/colorthemes.css b/examples/server/public/colorthemes.css similarity index 100% rename from examples/server/public_legacy/colorthemes.css rename to examples/server/public/colorthemes.css diff --git a/examples/server/public_legacy/completion.js b/examples/server/public/completion.js similarity index 96% rename from examples/server/public_legacy/completion.js rename to examples/server/public/completion.js index 30df7c2fa..36818f764 100644 --- a/examples/server/public_legacy/completion.js +++ b/examples/server/public/completion.js @@ -29,7 +29,7 @@ export async function* llama(prompt, params = {}, config = {}) { const completionParams = { ...paramDefaults, ...params, prompt }; - const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, { + const response = await fetch(`${api_url}/completion`, { method: 'POST', body: JSON.stringify(completionParams), headers: { @@ -78,12 +78,7 @@ export async function* llama(prompt, params = {}, config = {}) { for (const line of lines) { const match = regex.exec(line); if (match) { - result[match[1]] = match[2]; - if (result.data === '[DONE]') { - cont = false; - break; - } - + result[match[1]] = match[2] // since we know this is llama.cpp, let's just decode the json in data if (result.data) { result.data = JSON.parse(result.data); diff --git a/examples/server/public_legacy/favicon.ico b/examples/server/public/favicon.ico similarity index 100% rename from examples/server/public_legacy/favicon.ico rename to examples/server/public/favicon.ico diff --git a/examples/server/public_legacy/index-new.html b/examples/server/public/index-new.html similarity index 95% rename from examples/server/public_legacy/index-new.html rename to examples/server/public/index-new.html index cbfbbdf28..c87dd8f1e 100644 --- a/examples/server/public_legacy/index-new.html +++ b/examples/server/public/index-new.html @@ -39,15 +39,11 @@ temperature: 0.8, // adapt all following parameters to optimized min-p requierements. 
If for non-english, set to 0.6 or lower repeat_last_n: 0, // 0 = disable penalty, -1 = context size repeat_penalty: 1.0, // 1.0 = disabled - dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well - dry_base: 1.75, // 0.0 = disabled - dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well - dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + penalize_nl: false, // true only useful for infinite completion top_k: 0, // <= 0 to use vocab size top_p: 1.0, // 1.0 = disabled min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4 - xtc_probability: 0.0, // 0 = disabled; - xtc_threshold: 0.1, // > 0.5 disables XTC; + tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled frequency_penalty: 0.0, // 0.0 = disabled @@ -835,16 +831,11 @@ return html`
${IntField({ label: "Top-K", title: "Limits the selection of the next token to the K most probable tokens. 1 means no randomness = greedy sampling. If set to 0, it means the entire vocabulary size is considered.", max: 100, min: 0, step: 1, name: "top_k", value: params.value.top_k })} ${IntField({ label: "Penalize Last N", title: "The last n tokens that are taken into account to penalise repetitions. A value of 0 means that this function is deactivated and -1 means that the entire size of the context is taken into account.", max: 2048, min: 0, step: 16, name: "repeat_last_n", value: params.value.repeat_last_n })} - ${FloatField({ label: "Presence Penalty", title: "A penalty that is applied if certain tokens appear repeatedly in the generated text. A higher value leads to fewer repetitions.", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} - ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} ${FloatField({ label: "Top-P", title: "Limits the selection of the next token to a subset of tokens whose combined probability reaches a threshold value P = top-P. If set to 1, it means the entire vocabulary size is considered.", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} + ${FloatField({ label: "Presence Penalty", title: "A penalty that is applied if certain tokens appear repeatedly in the generated text. A higher value leads to fewer repetitions.", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} + ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} + ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} - ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} - ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} - ${FloatField({ label: "DRY Penalty Multiplier", title: "Set the DRY repetition penalty multiplier. 
Default is 0.0, which disables DRY.", max: 5.0, min: 0.0, name: "dry_multiplier", step: 0.01, value: params.value.dry_multiplier })} - ${FloatField({ label: "DRY Base", title: "Set the DRY repetition penalty base value. Default is 1.75", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })} - ${IntField({ label: "DRY Allowed Length", title: "Tokens that extend repetition beyond this receive exponentially increasing penalty. Default is 2", max: 10, min: 1, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })} - ${IntField({ label: "DRY Penalty Last N", title: "How many tokens to scan for repetitions. Default is -1, where 0 is disabled and -1 is context size", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })} ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
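// Illustrative sketch (not part of the patch above): how the DRY / XTC sampler
// fields shown in the removed form rows travel to the server through the
// llama() streaming helper from completion.js. The parameter names are taken
// from the params object earlier in this file; the example values, the module
// context assumed for top-level await, and the chunk.data.content response
// shape are assumptions, not something this diff guarantees.
import { llama } from './completion.js';

const samplerParams = {
  dry_multiplier: 0.8,    // 0.0 disables DRY; ~0.8 is the strength suggested in the comments above
  dry_base: 1.75,         // base of the exponential DRY penalty
  dry_allowed_length: 2,  // repetitions longer than this start to be penalized
  dry_penalty_last_n: -1, // -1 = scan the entire context for repetitions
  xtc_probability: 0.0,   // 0 disables XTC token removal
  xtc_threshold: 0.1,     // thresholds above 0.5 effectively disable XTC
};

for await (const chunk of llama('Building a website can be done in', samplerParams)) {
  document.write(chunk.data.content);
}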
@@ -1141,15 +1132,12 @@ document.addEventListener('DOMContentLoaded', (event) => { const snapSettings = { temperature: { snapValue: 1.0, snapRangeMultiplier: 6 }, min_p: { snapValue: 0.05, snapRangeMultiplier: 2 }, - xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 }, - xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 }, top_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, + tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 }, typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, repeat_penalty: { snapValue: 1.0, snapRangeMultiplier: 4 }, presence_penalty: { snapValue: 0.0, snapRangeMultiplier: 4 }, frequency_penalty: { snapValue: 0.0, snapRangeMultiplier: 4 }, - dry_multiplier: { snapValue: 0.0, snapRangeMultiplier: 4 }, - dry_base: { snapValue: 1.75, snapRangeMultiplier: 4 }, }; // add an event listener for each slider Object.keys(snapSettings).forEach(sliderName => { diff --git a/examples/server/public_legacy/index.html b/examples/server/public/index.html similarity index 82% rename from examples/server/public_legacy/index.html rename to examples/server/public/index.html index 75f39330a..48628a960 100644 --- a/examples/server/public_legacy/index.html +++ b/examples/server/public/index.html @@ -1,4 +1,5 @@ + @@ -131,20 +132,12 @@ align-items: stretch; } - .message-controls { + .right { display: flex; + flex-direction: row; + gap: 0.5em; justify-content: flex-end; } - .message-controls > div:nth-child(2) { - display: flex; - flex-direction: column; - gap: 0.5em; - } - .message-controls > div:nth-child(2) > div { - display: flex; - margin-left: auto; - gap: 0.5em; - } fieldset { border: none; @@ -283,7 +276,6 @@ import { llama } from './completion.js'; import { SchemaConverter } from './json-schema-to-grammar.mjs'; - let selected_image = false; var slot_id = -1; @@ -303,15 +295,11 @@ temperature: 0.7, repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled - dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well - dry_base: 1.75, // 0.0 = disabled - dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well - dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + penalize_nl: false, top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled - xtc_probability: 0.0, // 0 = disabled; - xtc_threshold: 0.1, // > 0.5 disables XTC; + tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled frequency_penalty: 0.0, // 0.0 = disabled @@ -459,9 +447,6 @@ /* END: Support for storing prompt templates and parameters in browsers LocalStorage */ - const tts = window.speechSynthesis; - const ttsVoice = signal(null) - const llamaStats = signal(null) const controller = signal(null) @@ -611,51 +596,8 @@ }); } - const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; - const talkRecognition = SpeechRecognition ? 
new SpeechRecognition() : null; function MessageInput() { - const message = useSignal(""); - - const talkActive = useSignal(false); - const sendOnTalk = useSignal(false); - const talkStop = (e) => { - if (e) e.preventDefault(); - - talkActive.value = false; - talkRecognition?.stop(); - } - const talk = (e) => { - e.preventDefault(); - - if (talkRecognition) - talkRecognition.start(); - else - alert("Speech recognition is not supported by this browser."); - } - if(talkRecognition) { - talkRecognition.onstart = () => { - talkActive.value = true; - } - talkRecognition.onresult = (e) => { - if (event.results.length > 0) { - message.value = event.results[0][0].transcript; - if (sendOnTalk.value) { - submit(e); - } - } - } - talkRecognition.onspeechend = () => { - talkStop(); - } - } - - const ttsVoices = useSignal(tts?.getVoices() || []); - const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default)); - if (tts) { - tts.onvoiceschanged = () => { - ttsVoices.value = tts.getVoices(); - } - } + const message = useSignal("") const submit = (e) => { stop(e); @@ -682,45 +624,11 @@ value="${message}" /> -
-
-
-
- - - - -
- -
- { - e.preventDefault(); - alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`); - }}>[?] - - -
-
+
+ + + +
` @@ -751,86 +659,26 @@ } }, [messages]) - const ttsChatLineActiveIx = useSignal(undefined); - const ttsChatLine = (e, ix, msg) => { - if (e) e.preventDefault(); - - if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return; - - const ttsVoices = tts.getVoices(); - const voice = ttsVoices.find(v => v.name === ttsVoice.value); - if (!voice) return; - - if (ttsChatLineActiveIx.value !== undefined) { - tts.cancel(); - if (ttsChatLineActiveIx.value === ix) { - ttsChatLineActiveIx.value = undefined; - return; - } - } - - ttsChatLineActiveIx.value = ix; - let ttsUtter = new SpeechSynthesisUtterance(msg); - ttsUtter.voice = voice; - ttsUtter.onend = e => { - ttsChatLineActiveIx.value = undefined; - }; - tts.speak(ttsUtter); - } - const isCompletionMode = session.value.type === 'completion' - - // Try play the last bot message - const lastCharChatLinesIxs = useSignal([]); - const lastCharChatLinesIxsOld = useSignal([]); - useEffect(() => { - if ( - !isCompletionMode - && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length - && !generating.value - ) { - const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1]; - if (ix !== undefined) { - const msg = messages[ix]; - ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg); - } - - lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value); - } - }, [generating.value]); - const chatLine = ([user, data], index) => { let message - const isArrayMessage = Array.isArray(data); - const text = isArrayMessage ? - data.map(msg => msg.content).join('') : - data; + const isArrayMessage = Array.isArray(data) if (params.value.n_probs > 0 && isArrayMessage) { message = html`<${Probabilities} data=${data} />` } else { + const text = isArrayMessage ? + data.map(msg => msg.content).join('') : + data; message = isCompletionMode ? text : html`<${Markdownish} text=${template(text)} />` } - - const fromBot = user && user === '{{char}}'; - if (fromBot && !lastCharChatLinesIxs.value.includes(index)) - lastCharChatLinesIxs.value.push(index); - if (user) { - return html` -
-

${template(user)}: ${message}

- ${ - fromBot && ttsVoice.value - && html`
` - } -
- `; + return html`

${template(user)}: ${message}

` } else { return isCompletionMode ? html`${message}` : - html`

${message}

` + html`

${message}

` } }; @@ -1005,6 +853,7 @@ ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} + ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} @@ -1012,15 +861,10 @@
More options
+ ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} - ${FloatField({ label: "DRY Penalty Multiplier", max: 5.0, min: 0.0, name: "dry_multiplier", step: 0.01, value: params.value.dry_multiplier })} - ${FloatField({ label: "DRY Base", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })} - ${IntField({ label: "DRY Allowed Length", max: 10, min: 2, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })} - ${IntField({ label: "DRY Penalty Last N", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })} - ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} - ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}

diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz deleted file mode 100644 index 141e80920..000000000 Binary files a/examples/server/public/index.html.gz and /dev/null differ diff --git a/examples/server/public/index.js b/examples/server/public/index.js new file mode 100644 index 000000000..670960939 --- /dev/null +++ b/examples/server/public/index.js @@ -0,0 +1 @@ +const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function 
v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,D,T,M={},F=[],A=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,V=Array.isArray;function W(t,n){for(var e in n)t[e]=n[e];return t}function L(t){var n=t.parentNode;n&&n.removeChild(t)}function O(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return R(t,r,_,i,null)}function R(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function I(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||F,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?R(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i)?(i.__=t,i.__b=t.__b+1,u=Z(i,e,r,f),i.__i=u,o=null,-1!==u&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u===r+1?c++:u>r?f>l-r?c+=u-r:c--:u(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),R(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+T++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=[],(_={})[n]=this,this.getChildContext=function(){return _},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=F.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=W({},this.state),"function"==typeof 
t&&(t=t(W({},e),this.props)),t&&W(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),D=et(!0),T=0;var at,pt,dt,vt,yt=0,mt=[],gt=[],bt=S,kt=bt.__b,wt=bt.__r,St=bt.diffed,xt=bt.__c,Ct=bt.unmount,Ut=bt.__;function Et(t,n){bt.__h&&bt.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({__V:gt}),e.__[t]}function Ht(t){return yt=1,Pt(zt,t)}function Pt(t,n,e){var _=Et(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):zt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Nt(t,n){var e=Et(at++,3);!bt.__s&&Bt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function $t(t,n){var e=Et(at++,4);!bt.__s&&Bt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function Dt(t){return yt=5,Mt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,$t((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Mt(t,n){var e=Et(at++,7);return Bt(e.__H,n)?(e.__V=t(),e.i=n,e.__h=t,e.__V):e.__}function Ft(t,n){return yt=8,Mt((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Et(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Vt(t,n){bt.useDebugValue&&bt.useDebugValue(n?n(t):t)}function Wt(t){var n=Et(at++,10),e=Ht();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Et(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(jt),t.__H.__h.forEach(qt),t.__H.__h=[]}catch(n){t.__H.__h=[],bt.__e(n,t.__v)}}bt.__b=function(t){pt=null,kt&&kt(t)},bt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ut&&Ut(t,n)},bt.__r=function(t){wt&&wt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.__V=gt,t.__N=t.i=void 0}))):(n.__h.forEach(jt),n.__h.forEach(qt),n.__h=[],at=0)),dt=pt},bt.diffed=function(t){St&&St(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===bt.requestAnimationFrame||((vt=bt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.__V!==gt&&(t.__=t.__V),t.i=void 0,t.__V=gt}))),dt=pt=null},bt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(jt),t.__h=t.__h.filter((function(t){return!t.__||qt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],bt.__e(r,t.__v)}})),xt&&xt(t,n)},bt.unmount=function(t){Ct&&Ct(t);var 
n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{jt(t)}catch(t){n=t}})),e.__H=void 0,n&&bt.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function jt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function qt(t){var n=pt;t.__c=t.__(),pt=n}function Bt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function zt(t,n){return"function"==typeof n?n(t):n}function Gt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Jt,Kt;function Qt(t){if(Kt)Kt();Kt=t&&t.S()}function Xt({data:t}){const n=Zt(t);n.value=t;const e=Mt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Xt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Xt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});Gt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});Gt("__r",(t,n)=>{Qt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Jt=_;Qt(e);t(n)});Gt("__e",(t,n,e,_)=>{Qt();Jt=void 0;t(n,e,_)});Gt("diffed",(t,n)=>{Qt();Jt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Yt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Yt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}Gt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});Gt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Zt(t){return Mt(()=>c(t),[])}function tn(t){const n=Dt(t);n.current=t;Jt.__$f|=4;return Mt(()=>v(()=>n.current()),[])}function nn(t){const n=Dt(t);n.current=t;Nt(()=>k(()=>n.current()),[])}var en=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var rn=on.bind(O);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,O as createElement,I as createRef,k as 
effect,O as h,rn as html,ft as hydrate,C as isValidElement,S as options,st as render,c as signal,Y as toChildArray,o as untracked,Ft as useCallback,tn as useComputed,At as useContext,Vt as useDebugValue,Nt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,$t as useLayoutEffect,Mt as useMemo,Pt as useReducer,Dt as useRef,Zt as useSignal,nn as useSignalEffect,Ht as useState}; diff --git a/examples/server/public_legacy/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs similarity index 99% rename from examples/server/public_legacy/json-schema-to-grammar.mjs rename to examples/server/public/json-schema-to-grammar.mjs index e67bb15c1..7267f3f9c 100644 --- a/examples/server/public_legacy/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -529,7 +529,7 @@ export class SchemaConverter { return joinSeq(); }; - return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space") + return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space") } _notStrings(strings) { diff --git a/examples/server/public/loading.html b/examples/server/public/loading.html deleted file mode 100644 index c3fd19a0f..000000000 --- a/examples/server/public/loading.html +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - -
- The model is loading. Please wait.
- The user interface will appear soon. -
- - diff --git a/examples/server/public_legacy/prompt-formats.js b/examples/server/public/prompt-formats.js similarity index 100% rename from examples/server/public_legacy/prompt-formats.js rename to examples/server/public/prompt-formats.js diff --git a/examples/server/public_legacy/style.css b/examples/server/public/style.css old mode 100644 new mode 100755 similarity index 100% rename from examples/server/public_legacy/style.css rename to examples/server/public/style.css diff --git a/examples/server/public_legacy/system-prompts.js b/examples/server/public/system-prompts.js similarity index 100% rename from examples/server/public_legacy/system-prompts.js rename to examples/server/public/system-prompts.js diff --git a/examples/server/public_legacy/theme-beeninorder.css b/examples/server/public/theme-beeninorder.css similarity index 100% rename from examples/server/public_legacy/theme-beeninorder.css rename to examples/server/public/theme-beeninorder.css diff --git a/examples/server/public_legacy/theme-ketivah.css b/examples/server/public/theme-ketivah.css similarity index 100% rename from examples/server/public_legacy/theme-ketivah.css rename to examples/server/public/theme-ketivah.css diff --git a/examples/server/public_legacy/theme-mangotango.css b/examples/server/public/theme-mangotango.css similarity index 100% rename from examples/server/public_legacy/theme-mangotango.css rename to examples/server/public/theme-mangotango.css diff --git a/examples/server/public_legacy/theme-playground.css b/examples/server/public/theme-playground.css similarity index 100% rename from examples/server/public_legacy/theme-playground.css rename to examples/server/public/theme-playground.css diff --git a/examples/server/public_legacy/theme-polarnight.css b/examples/server/public/theme-polarnight.css similarity index 100% rename from examples/server/public_legacy/theme-polarnight.css rename to examples/server/public/theme-polarnight.css diff --git a/examples/server/public_legacy/theme-snowstorm.css b/examples/server/public/theme-snowstorm.css similarity index 100% rename from examples/server/public_legacy/theme-snowstorm.css rename to examples/server/public/theme-snowstorm.css diff --git a/examples/server/public_legacy/index.js b/examples/server/public_legacy/index.js deleted file mode 100644 index 32ec6e9e1..000000000 --- a/examples/server/public_legacy/index.js +++ /dev/null @@ -1 +0,0 @@ -const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const 
n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},A=[],F=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){t&&t.parentNode&&t.parentNode.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var 
o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||A,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i).__=t,i.__b=t.__b+1,o=null,-1!==(u=i.__i=Z(i,e,r,f))&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:(u>r?c--:c++,i.__u|=65536))):i=t.__k[_]=null;if(f)for(_=0;_(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=new Set,(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.forEach((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.add(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.delete(t),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=A.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&>.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var 
e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Ft(t,n){gt.useDebugValue&>.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&>.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return 
n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,ft as hydrate,C as isValidElement,S as options,st as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,At as useContext,Ft as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState}; diff --git a/examples/server/public_legacy/loading.html b/examples/server/public_legacy/loading.html deleted file mode 100644 index c3fd19a0f..000000000 --- a/examples/server/public_legacy/loading.html +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - -
- The model is loading. Please wait.
- The user interface will appear soon. -
- - diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js index 2fcd24a86..8e0df3b61 100644 --- a/examples/server/public_simplechat/simplechat.js +++ b/examples/server/public_simplechat/simplechat.js @@ -407,9 +407,6 @@ class SimpleChat { if (curLine.startsWith("data:")) { curLine = curLine.substring(5); } - if (curLine.trim() === "[DONE]") { - break; - } let curJson = JSON.parse(curLine); console.debug("DBUG:SC:PART:Json:", curJson); this.append_response(this.response_extract_stream(curJson, apiEP)); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0718806c8..7813a2957 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,1232 +1,137 @@ #include "utils.hpp" -#include "arg.h" #include "common.h" #include "json-schema-to-grammar.h" #include "llama.h" -#include "log.h" -#include "sampling.h" -#include "speculative.h" +#include "grammar-parser.h" +#ifndef NDEBUG +// crash the server in debug mode, otherwise send an http 500 error +#define CPPHTTPLIB_NO_EXCEPTIONS 1 +#endif +// increase max payload length to allow use of larger context size +#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 +#include "httplib.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" -// mime type for sending response -#define MIMETYPE_JSON "application/json; charset=utf-8" -// auto generated files (see README.md for details) -#include "index.html.gz.hpp" -#include "loading.html.hpp" +// auto generated files (update with ./deps.sh) +#include "colorthemes.css.hpp" +#include "style.css.hpp" +#include "theme-beeninorder.css.hpp" +#include "theme-ketivah.css.hpp" +#include "theme-mangotango.css.hpp" +#include "theme-playground.css.hpp" +#include "theme-polarnight.css.hpp" +#include "theme-snowstorm.css.hpp" +#include "index.html.hpp" +#include "index-new.html.hpp" +#include "index.js.hpp" +#include "completion.js.hpp" +#include "system-prompts.js.hpp" +#include "prompt-formats.js.hpp" +#include "json-schema-to-grammar.mjs.hpp" #include #include #include #include -#include -#include -#include +#include #include -#include #include -#include -#include +#include +#include using json = nlohmann::ordered_json; -constexpr int HTTP_POLLING_SECONDS = 1; +bool server_verbose = false; +bool server_log_json = true; enum stop_type { - STOP_TYPE_NONE, - STOP_TYPE_EOS, - STOP_TYPE_WORD, - STOP_TYPE_LIMIT, + STOP_TYPE_FULL, + STOP_TYPE_PARTIAL, }; -// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 enum slot_state { SLOT_STATE_IDLE, - SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future - SLOT_STATE_PROCESSING_PROMPT, - SLOT_STATE_DONE_PROMPT, - SLOT_STATE_GENERATING, + SLOT_STATE_PROCESSING, +}; + +enum slot_command { + SLOT_COMMAND_NONE, + SLOT_COMMAND_LOAD_PROMPT, + SLOT_COMMAND_RELEASE, }; enum server_state { SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet SERVER_STATE_READY, // Server is ready and model is loaded + SERVER_STATE_ERROR // An error occurred, load_model failed }; enum server_task_type { SERVER_TASK_TYPE_COMPLETION, - SERVER_TASK_TYPE_EMBEDDING, - SERVER_TASK_TYPE_RERANK, - SERVER_TASK_TYPE_INFILL, SERVER_TASK_TYPE_CANCEL, SERVER_TASK_TYPE_NEXT_RESPONSE, SERVER_TASK_TYPE_METRICS, SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE, - SERVER_TASK_TYPE_SET_LORA, -}; 
- -enum oaicompat_type { - OAICOMPAT_TYPE_NONE, - OAICOMPAT_TYPE_CHAT, - OAICOMPAT_TYPE_COMPLETION, - OAICOMPAT_TYPE_EMBEDDING, -}; - -// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 -enum error_type { - ERROR_TYPE_INVALID_REQUEST, - ERROR_TYPE_AUTHENTICATION, - ERROR_TYPE_SERVER, - ERROR_TYPE_NOT_FOUND, - ERROR_TYPE_PERMISSION, - ERROR_TYPE_UNAVAILABLE, // custom error - ERROR_TYPE_NOT_SUPPORTED, // custom error -}; - -struct slot_params { - bool stream = true; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt - bool return_tokens = false; - - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict - int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters - - int64_t t_max_prompt_ms = -1; // TODO: implement - int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit - - std::vector lora; - - std::vector antiprompt; - std::vector response_fields; - bool timings_per_token = false; - bool post_sampling_probs = false; - bool ignore_eos = false; - - struct common_params_sampling sampling; - struct common_params_speculative speculative; - - // OAI-compat fields - bool verbose = false; - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - common_chat_format oaicompat_chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - - json to_json() const { - std::vector samplers; - samplers.reserve(sampling.samplers.size()); - for (const auto & sampler : sampling.samplers) { - samplers.emplace_back(common_sampler_type_to_str(sampler)); - } - - json lora = json::array(); - for (size_t i = 0; i < this->lora.size(); ++i) { - lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); - } - - std::vector grammar_trigger_words; - for (const auto & trigger : sampling.grammar_trigger_words) { - grammar_trigger_words.push_back(trigger.word); - } - - return json { - {"n_predict", n_predict}, // Server configured n_predict - {"seed", sampling.seed}, - {"temperature", sampling.temp}, - {"dynatemp_range", sampling.dynatemp_range}, - {"dynatemp_exponent", sampling.dynatemp_exponent}, - {"top_k", sampling.top_k}, - {"top_p", sampling.top_p}, - {"min_p", sampling.min_p}, - {"xtc_probability", sampling.xtc_probability}, - {"xtc_threshold", sampling.xtc_threshold}, - {"typical_p", sampling.typ_p}, - {"repeat_last_n", sampling.penalty_last_n}, - {"repeat_penalty", sampling.penalty_repeat}, - {"presence_penalty", sampling.penalty_present}, - {"frequency_penalty", sampling.penalty_freq}, - {"dry_multiplier", sampling.dry_multiplier}, - {"dry_base", sampling.dry_base}, - {"dry_allowed_length", sampling.dry_allowed_length}, - {"dry_penalty_last_n", sampling.dry_penalty_last_n}, - {"dry_sequence_breakers", sampling.dry_sequence_breakers}, - {"mirostat", sampling.mirostat}, - {"mirostat_tau", sampling.mirostat_tau}, - {"mirostat_eta", sampling.mirostat_eta}, - {"stop", antiprompt}, - {"max_tokens", n_predict}, // User configured n_predict - {"n_keep", n_keep}, - {"n_discard", n_discard}, - {"ignore_eos", sampling.ignore_eos}, - {"stream", stream}, - {"logit_bias", format_logit_bias(sampling.logit_bias)}, - {"n_probs", sampling.n_probs}, - {"min_keep", sampling.min_keep}, - {"grammar", sampling.grammar}, - {"grammar_trigger_words", 
grammar_trigger_words}, - {"grammar_trigger_tokens", sampling.grammar_trigger_tokens}, - {"preserved_tokens", sampling.preserved_tokens}, - {"samplers", samplers}, - {"speculative.n_max", speculative.n_max}, - {"speculative.n_min", speculative.n_min}, - {"speculative.p_min", speculative.p_min}, - {"timings_per_token", timings_per_token}, - {"post_sampling_probs", post_sampling_probs}, - {"lora", lora}, - }; - } }; struct server_task { - int id = -1; // to be filled by server_queue - int index = -1; // used when there are multiple prompts (batch request) - - server_task_type type; - - // used by SERVER_TASK_TYPE_CANCEL + int id = -1; // to be filled by server_queue + int id_multi = -1; int id_target = -1; - // used by SERVER_TASK_TYPE_INFERENCE - slot_params params; - llama_tokens prompt_tokens; - int id_selected_slot = -1; + server_task_type type; + json data; - // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE - struct slot_action { - int slot_id; - std::string filename; - std::string filepath; - }; - slot_action slot_action; - - // used by SERVER_TASK_TYPE_METRICS - bool metrics_reset_bucket = false; - - // used by SERVER_TASK_TYPE_SET_LORA - std::vector set_lora; - - server_task(server_task_type type) : type(type) {} - - static slot_params params_from_json_cmpl( - const llama_context * ctx, - const common_params & params_base, - const json & data) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - slot_params params; - - // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - slot_params defaults; - defaults.sampling = params_base.sampling; - defaults.speculative = params_base.speculative; - - // enabling this will output extra debug information in the HTTP responses from the server - params.verbose = params_base.verbosity > 9; - params.timings_per_token = json_value(data, "timings_per_token", false); - - params.stream = json_value(data, "stream", false); - params.cache_prompt = json_value(data, "cache_prompt", true); - params.return_tokens = json_value(data, "return_tokens", false); - params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); - params.n_indent = json_value(data, "n_indent", defaults.n_indent); - params.n_keep = json_value(data, "n_keep", defaults.n_keep); - params.n_discard = json_value(data, "n_discard", defaults.n_discard); - //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement - params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); - params.response_fields = json_value(data, "response_fields", std::vector()); - - params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); - params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); - params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); - params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); - params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); - params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); - params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); - params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); - params.sampling.dynatemp_exponent = 
json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); - params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); - params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); - params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); - params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); - params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); - params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); - params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); - params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); - params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); - params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); - params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); - params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); - params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); - params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); - - params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); - params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); - params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); - - params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min); - params.speculative.n_min = std::max(params.speculative.n_min, 2); - params.speculative.n_max = std::max(params.speculative.n_max, 0); - - // Use OpenAI API logprobs only if n_probs wasn't provided - if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){ - params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs); - } - - if (data.contains("lora")) { - if (data.at("lora").is_array()) { - params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora")); - } else { - throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields"); - } - } else { - params.lora = params_base.lora_adapters; - } - - // TODO: add more sanity checks for the input parameters - - if (params.sampling.penalty_last_n < -1) { - throw std::runtime_error("Error: repeat_last_n must be >= -1"); - } - - if (params.sampling.dry_penalty_last_n < -1) { - throw std::runtime_error("Error: dry_penalty_last_n must be >= -1"); - } - - if (params.sampling.penalty_last_n == -1) { - // note: should be the slot's context and not the full context, but it's ok - params.sampling.penalty_last_n = llama_n_ctx(ctx); - } - - if (params.sampling.dry_penalty_last_n == -1) { - params.sampling.dry_penalty_last_n = llama_n_ctx(ctx); - } - - if (params.sampling.dry_base < 1.0f) { - params.sampling.dry_base = defaults.sampling.dry_base; - } - - // sequence breakers for DRY - { - // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format - // Ref: 
https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 - - if (data.contains("dry_sequence_breakers")) { - params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); - if (params.sampling.dry_sequence_breakers.empty()) { - throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings"); - } - } - } - - // process "json_schema" and "grammar" - if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { - throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both"); - } - if (data.contains("json_schema") && !data.contains("grammar")) { - try { - auto schema = json_value(data, "json_schema", json::object()); - SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str()); - params.sampling.grammar = json_schema_to_grammar(schema); - SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str()); - } catch (const std::exception & e) { - throw std::runtime_error(std::string("\"json_schema\": ") + e.what()); - } - } else { - params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar); - SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str()); - params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy); - SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false"); - } - - { - auto it = data.find("chat_format"); - if (it != data.end()) { - params.oaicompat_chat_format = static_cast(it->get()); - SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str()); - } else { - params.oaicompat_chat_format = defaults.oaicompat_chat_format; - } - } - - { - const auto grammar_triggers = data.find("grammar_triggers"); - if (grammar_triggers != data.end()) { - for (const auto & t : *grammar_triggers) { - common_grammar_trigger trigger; - trigger.word = t.at("word"); - trigger.at_start = t.at("at_start"); - - auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true); - if (ids.size() == 1) { - SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str()); - params.sampling.grammar_trigger_tokens.push_back(ids[0]); - params.sampling.preserved_tokens.insert(ids[0]); - continue; - } - SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str()); - params.sampling.grammar_trigger_words.push_back(trigger); - } - } - const auto preserved_tokens = data.find("preserved_tokens"); - if (preserved_tokens != data.end()) { - for (const auto & t : *preserved_tokens) { - auto ids = common_tokenize(vocab, t.get(), /* add_special= */ false, /* parse_special= */ true); - if (ids.size() == 1) { - SRV_DBG("Preserved token: %d\n", ids[0]); - params.sampling.preserved_tokens.insert(ids[0]); - } else { - // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens. 
- SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get().c_str()); - } - } - } - if (params.sampling.grammar_lazy) { - GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0); - } - } - - { - params.sampling.logit_bias.clear(); - params.ignore_eos = json_value(data, "ignore_eos", false); - - const auto & logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_vocab_n_tokens(vocab); - for (const auto & el : *logit_bias) { - // TODO: we may want to throw errors here, in case "el" is incorrect - if (el.is_array() && el.size() == 2) { - float bias; - if (el[1].is_number()) { - bias = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - bias = -INFINITY; - } else { - continue; - } - - if (el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } else if (el[0].is_string()) { - auto toks = common_tokenize(vocab, el[0].get(), false); - for (auto tok : toks) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } - } - } - } - } - - { - params.antiprompt.clear(); - - const auto & stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto & word : *stop) { - if (!word.empty()) { - params.antiprompt.push_back(word); - } - } - } - } - - { - const auto samplers = data.find("samplers"); - if (samplers != data.end()) { - if (samplers->is_array()) { - params.sampling.samplers = common_sampler_types_from_names(*samplers, false); - } else if (samplers->is_string()){ - params.sampling.samplers = common_sampler_types_from_chars(samplers->get()); - } - } else { - params.sampling.samplers = defaults.sampling.samplers; - } - } - - std::string model_name = params_base.model_alias.empty() ? 
DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; - params.oaicompat_model = json_value(data, "model", model_name); - - return params; - } - - // utility function - static std::unordered_set get_list_id(const std::vector & tasks) { - std::unordered_set ids(tasks.size()); - for (size_t i = 0; i < tasks.size(); i++) { - ids.insert(tasks[i].id); - } - return ids; - } -}; - -struct result_timings { - int32_t prompt_n = -1; - double prompt_ms; - double prompt_per_token_ms; - double prompt_per_second; - - int32_t predicted_n = -1; - double predicted_ms; - double predicted_per_token_ms; - double predicted_per_second; - - json to_json() const { - return { - {"prompt_n", prompt_n}, - {"prompt_ms", prompt_ms}, - {"prompt_per_token_ms", prompt_per_token_ms}, - {"prompt_per_second", prompt_per_second}, - - {"predicted_n", predicted_n}, - {"predicted_ms", predicted_ms}, - {"predicted_per_token_ms", predicted_per_token_ms}, - {"predicted_per_second", predicted_per_second}, - }; - } + bool infill = false; + bool embedding = false; }; struct server_task_result { - int id = -1; - int id_slot = -1; - virtual bool is_error() { - // only used by server_task_result_error - return false; - } - virtual bool is_stop() { - // only used by server_task_result_cmpl_* - return false; - } - virtual int get_index() { - return -1; - } - virtual json to_json() = 0; - virtual ~server_task_result() = default; + int id = -1; + int id_multi = -1; + + json data; + + bool stop; + bool error; }; -// using shared_ptr for polymorphism of server_task_result -using server_task_result_ptr = std::unique_ptr; +struct server_task_multi { + int id = -1; -inline std::string stop_type_to_str(stop_type type) { - switch (type) { - case STOP_TYPE_EOS: return "eos"; - case STOP_TYPE_WORD: return "word"; - case STOP_TYPE_LIMIT: return "limit"; - default: return "none"; - } -} - -struct completion_token_output { - llama_token tok; - float prob; - std::string text_to_send; - struct prob_info { - llama_token tok; - std::string txt; - float prob; - }; - std::vector probs; - - json to_json(bool post_sampling_probs) const { - json probs_for_token = json::array(); - for (const auto & p : probs) { - std::string txt(p.txt); - txt.resize(validate_utf8(txt)); - probs_for_token.push_back(json { - {"id", p.tok}, - {"token", txt}, - {"bytes", str_to_bytes(p.txt)}, - { - post_sampling_probs ? "prob" : "logprob", - post_sampling_probs ? p.prob : logarithm(p.prob) - }, - }); - } - return probs_for_token; - } - - static json probs_vector_to_json(const std::vector & probs, bool post_sampling_probs) { - json out = json::array(); - for (const auto & p : probs) { - std::string txt(p.text_to_send); - txt.resize(validate_utf8(txt)); - out.push_back(json { - {"id", p.tok}, - {"token", txt}, - {"bytes", str_to_bytes(p.text_to_send)}, - { - post_sampling_probs ? "prob" : "logprob", - post_sampling_probs ? p.prob : logarithm(p.prob) - }, - { - post_sampling_probs ? "top_probs" : "top_logprobs", - p.to_json(post_sampling_probs) - }, - }); - } - return out; - } - - static float logarithm(float x) { - // nlohmann::json converts -inf to null, so we need to prevent that - return x == 0.0f ? 
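// The result_timings fields above are all derived from two measured quantities per
// phase: elapsed milliseconds and token count. A tiny sketch of that arithmetic
// (the numbers are invented for illustration):

#include <cstdio>

int main() {
    const double prompt_ms = 250.0;   // measured prompt-processing time
    const int    prompt_n  = 100;     // prompt tokens processed

    const double per_token_ms = prompt_ms / prompt_n;         // 2.5 ms per token
    const double per_second   = 1e3 / prompt_ms * prompt_n;   // 400 tokens per second

    std::printf("%.2f ms/token, %.2f tokens/s\n", per_token_ms, per_second);
}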
std::numeric_limits::lowest() : std::log(x); - } - - static std::vector str_to_bytes(const std::string & str) { - std::vector bytes; - for (unsigned char c : str) { - bytes.push_back(c); - } - return bytes; - } + std::set subtasks_remaining; + std::vector results; }; -struct server_task_result_cmpl_final : server_task_result { - int index = 0; +struct slot_params { + bool stream = true; + bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt - std::string content; - llama_tokens tokens; + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict - bool stream; - result_timings timings; - std::string prompt; + std::vector antiprompt; - bool truncated; - int32_t n_decoded; - int32_t n_prompt_tokens; - int32_t n_tokens_cached; - bool has_new_line; - std::string stopping_word; - stop_type stop = STOP_TYPE_NONE; - - bool post_sampling_probs; - std::vector probs_output; - std::vector response_fields; - - slot_params generation_params; - - // OAI-compat fields - bool verbose = false; - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - common_chat_format oaicompat_chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - - virtual int get_index() override { - return index; - } - - virtual bool is_stop() override { - return true; // in stream mode, final responses are considered stop - } - - virtual json to_json() override { - switch (oaicompat) { - case OAICOMPAT_TYPE_NONE: - return to_json_non_oaicompat(); - case OAICOMPAT_TYPE_COMPLETION: - return to_json_oaicompat(); - case OAICOMPAT_TYPE_CHAT: - return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); - default: - GGML_ASSERT(false && "Invalid oaicompat_type"); - } - } - - json to_json_non_oaicompat() { - json res = json { - {"index", index}, - {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk - {"tokens", stream ? llama_tokens {} : tokens}, - {"id_slot", id_slot}, - {"stop", true}, - {"model", oaicompat_model}, - {"tokens_predicted", n_decoded}, - {"tokens_evaluated", n_prompt_tokens}, - {"generation_settings", generation_params.to_json()}, - {"prompt", prompt}, - {"has_new_line", has_new_line}, - {"truncated", truncated}, - {"stop_type", stop_type_to_str(stop)}, - {"stopping_word", stopping_word}, - {"tokens_cached", n_tokens_cached}, - {"timings", timings.to_json()}, - }; - if (!stream && !probs_output.empty()) { - res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs); - } - return response_fields.empty() ? res : json_get_nested_values(response_fields, res); - } - - json to_json_oaicompat() { - std::time_t t = std::time(0); - json logprobs = json(nullptr); // OAI default to null - if (!stream && probs_output.size() > 0) { - logprobs = json{ - {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, - }; - } - json finish_reason = "length"; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } - json res = json { - {"choices", json::array({ - json{ - {"text", stream ? 
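// A standalone sketch of the logarithm() guard and byte helper shown above:
// nlohmann::json serializes -inf as null, so a probability of exactly 0 is mapped to
// the most negative finite float instead of log(0); str_to_bytes() just exposes the
// raw UTF-8 bytes of a token. The int element type here is a simplification.

#include <cmath>
#include <cstdio>
#include <limits>
#include <string>
#include <vector>

static float safe_log(float x) {
    return x == 0.0f ? std::numeric_limits<float>::lowest() : std::log(x);
}

static std::vector<int> str_to_bytes(const std::string & str) {
    std::vector<int> bytes;
    for (unsigned char c : str) { bytes.push_back(c); }   // one entry per raw byte
    return bytes;
}

int main() {
    std::printf("safe_log(0)   = %g\n", safe_log(0.0f));             // finite, JSON-representable
    std::printf("safe_log(0.5) = %g\n", safe_log(0.5f));             // ~ -0.693
    std::printf("bytes in \"é\": %zu\n", str_to_bytes("é").size());  // 2 bytes in UTF-8
}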
"" : content}, // in stream mode, content is already in last partial chunk - {"index", index}, - {"logprobs", logprobs}, - {"finish_reason", finish_reason}, - } - })}, - {"created", t}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "text_completion"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens} - }}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json_non_oaicompat(); - } - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - - return res; - } - - json to_json_oaicompat_chat() { - std::string finish_reason = "length"; - common_chat_msg msg; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - SRV_DBG("Parsing chat message: %s\n", content.c_str()); - msg = common_chat_parse(content, oaicompat_chat_format); - finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls"; - } else { - msg.content = content; - } - - json tool_calls; - if (!msg.tool_calls.empty()) { - tool_calls = json::array(); - for (const auto & tc : msg.tool_calls) { - tool_calls.push_back({ - {"type", "function"}, - {"function", { - {"name", tc.name}, - {"arguments", tc.arguments}, - }}, - {"id", tc.id}, - }); - } - } - - json message { - {"content", msg.content}, - {"tool_calls", tool_calls}, - {"role", "assistant"}, - }; - if (!msg.tool_plan.empty()) { - message["tool_plan"] = msg.tool_plan; - } - - json choice { - {"finish_reason", finish_reason}, - {"index", 0}, - {"message", message}, - }; - - if (!stream && probs_output.size() > 0) { - choice["logprobs"] = json{ - {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, - }; - } - - std::time_t t = std::time(0); - - json res = json { - {"choices", json::array({choice})}, - {"created", t}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens} - }}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json_non_oaicompat(); - } - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - - return res; - } - - json to_json_oaicompat_chat_stream() { - std::time_t t = std::time(0); - std::string finish_reason = "length"; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } - - json choice = json { - {"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()} - }; - - json ret = json { - {"choices", json::array({choice})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion.chunk"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens}, - }}, - }; - - if (timings.prompt_n >= 0) { - ret.push_back({"timings", timings.to_json()}); - } - - return ret; - } -}; - -struct server_task_result_cmpl_partial : server_task_result { - int index = 0; - - std::string content; - llama_tokens tokens; - - int32_t n_decoded; - int32_t n_prompt_tokens; - - bool post_sampling_probs; - completion_token_output prob_output; - result_timings timings; - - // OAI-compat fields - bool verbose = false; - oaicompat_type oaicompat = 
OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - - virtual int get_index() override { - return index; - } - - virtual bool is_stop() override { - return false; // in stream mode, partial responses are not considered stop - } - - virtual json to_json() override { - switch (oaicompat) { - case OAICOMPAT_TYPE_NONE: - return to_json_non_oaicompat(); - case OAICOMPAT_TYPE_COMPLETION: - return to_json_oaicompat(); - case OAICOMPAT_TYPE_CHAT: - return to_json_oaicompat_chat(); - default: - GGML_ASSERT(false && "Invalid oaicompat_type"); - } - } - - json to_json_non_oaicompat() { - // non-OAI-compat JSON - json res = json { - {"index", index}, - {"content", content}, - {"tokens", tokens}, - {"stop", false}, - {"id_slot", id_slot}, - {"tokens_predicted", n_decoded}, - {"tokens_evaluated", n_prompt_tokens}, - }; - // populate the timings object when needed (usually for the last response or with timings_per_token enabled) - if (timings.prompt_n > 0) { - res.push_back({"timings", timings.to_json()}); - } - if (!prob_output.probs.empty()) { - res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs); - } - return res; - } - - json to_json_oaicompat() { - std::time_t t = std::time(0); - json logprobs = json(nullptr); // OAI default to null - if (prob_output.probs.size() > 0) { - logprobs = json{ - {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, - }; - } - json res = json { - {"choices", json::array({ - json{ - {"text", content}, - {"index", index}, - {"logprobs", logprobs}, - {"finish_reason", nullptr}, - } - })}, - {"created", t}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "text_completion"}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json_non_oaicompat(); - } - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - - return res; - } - - json to_json_oaicompat_chat() { - bool first = n_decoded == 0; - std::time_t t = std::time(0); - json choices; - - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json { - {"content", content}}} - }})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json { - {"content", content}, - }}, - }}); - } - - GGML_ASSERT(choices.size() >= 1); - - if (prob_output.probs.size() > 0) { - choices[0]["logprobs"] = json{ - {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, - }; - } - - json ret = json { - {"choices", choices}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", 
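// A sketch of the "two updates" behaviour in the streaming code above: OpenAI clients
// expect the first chunk of a chat stream to carry only {"role":"assistant"} in the
// delta, so when the first sampled token already has text the server emits two chunks.
// This assumes nlohmann::json and trims the envelope down to the fields that matter here.

#include <cstdio>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

static std::vector<json> first_chunks(const std::string & content) {
    json role_delta = {
        {"choices", json::array({ json{{"index", 0}, {"finish_reason", nullptr},
                                       {"delta", json{{"role", "assistant"}}}} })},
        {"object", "chat.completion.chunk"},
    };
    if (content.empty()) {
        return { role_delta };                     // nothing generated yet: single chunk
    }
    json content_delta = role_delta;               // same envelope, different delta
    content_delta["choices"][0]["delta"] = json{{"content", content}};
    return { role_delta, content_delta };          // role first, then the text
}

int main() {
    for (const auto & chunk : first_chunks("Hello")) {
        std::printf("%s\n", chunk.dump().c_str());
    }
}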
"chat.completion.chunk"} - }; - - if (timings.prompt_n >= 0) { - ret.push_back({"timings", timings.to_json()}); - } - - return std::vector({ret}); - } -}; - -struct server_task_result_embd : server_task_result { - int index = 0; - std::vector> embedding; - - int32_t n_tokens; - - // OAI-compat fields - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - return oaicompat == OAICOMPAT_TYPE_EMBEDDING - ? to_json_oaicompat() - : to_json_non_oaicompat(); - } - - json to_json_non_oaicompat() { - return json { - {"index", index}, - {"embedding", embedding}, - }; - } - - json to_json_oaicompat() { - return json { - {"index", index}, - {"embedding", embedding[0]}, - {"tokens_evaluated", n_tokens}, - }; - } -}; - -struct server_task_result_rerank : server_task_result { - int index = 0; - float score = -1e6; - - int32_t n_tokens; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - return json { - {"index", index}, - {"score", score}, - {"tokens_evaluated", n_tokens}, - }; - } -}; - -// this function maybe used outside of server_task_result_error -static json format_error_response(const std::string & message, const enum error_type type) { - std::string type_str; - int code = 500; - switch (type) { - case ERROR_TYPE_INVALID_REQUEST: - type_str = "invalid_request_error"; - code = 400; - break; - case ERROR_TYPE_AUTHENTICATION: - type_str = "authentication_error"; - code = 401; - break; - case ERROR_TYPE_NOT_FOUND: - type_str = "not_found_error"; - code = 404; - break; - case ERROR_TYPE_SERVER: - type_str = "server_error"; - code = 500; - break; - case ERROR_TYPE_PERMISSION: - type_str = "permission_error"; - code = 403; - break; - case ERROR_TYPE_NOT_SUPPORTED: - type_str = "not_supported_error"; - code = 501; - break; - case ERROR_TYPE_UNAVAILABLE: - type_str = "unavailable_error"; - code = 503; - break; - } - return json { - {"code", code}, - {"message", message}, - {"type", type_str}, - }; -} - -struct server_task_result_error : server_task_result { - int index = 0; - error_type err_type = ERROR_TYPE_SERVER; - std::string err_msg; - - virtual bool is_error() override { - return true; - } - - virtual json to_json() override { - return format_error_response(err_msg, err_type); - } -}; - -struct server_task_result_metrics : server_task_result { - int n_idle_slots; - int n_processing_slots; - int n_tasks_deferred; - int64_t t_start; - - int32_t kv_cache_tokens_count; - int32_t kv_cache_used_cells; - - // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields - uint64_t n_prompt_tokens_processed_total = 0; - uint64_t t_prompt_processing_total = 0; - uint64_t n_tokens_predicted_total = 0; - uint64_t t_tokens_generation_total = 0; - - uint64_t n_prompt_tokens_processed = 0; - uint64_t t_prompt_processing = 0; - - uint64_t n_tokens_predicted = 0; - uint64_t t_tokens_generation = 0; - - uint64_t n_decode_total = 0; - uint64_t n_busy_slots_total = 0; - - // while we can also use std::vector this requires copying the slot object which can be quite messy - // therefore, we use json to temporarily store the slot.to_json() result - json slots_data = json::array(); - - virtual json to_json() override { - return json { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "deferred", n_tasks_deferred }, - { "t_start", t_start }, - - { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, - { 
"t_tokens_generation_total", t_tokens_generation_total }, - { "n_tokens_predicted_total", n_tokens_predicted_total }, - { "t_prompt_processing_total", t_prompt_processing_total }, - - { "n_prompt_tokens_processed", n_prompt_tokens_processed }, - { "t_prompt_processing", t_prompt_processing }, - { "n_tokens_predicted", n_tokens_predicted }, - { "t_tokens_generation", t_tokens_generation }, - - { "n_decode_total", n_decode_total }, - { "n_busy_slots_total", n_busy_slots_total }, - - { "kv_cache_tokens_count", kv_cache_tokens_count }, - { "kv_cache_used_cells", kv_cache_used_cells }, - - { "slots", slots_data }, - }; - } -}; - -struct server_task_result_slot_save_load : server_task_result { - std::string filename; - bool is_save; // true = save, false = load - - size_t n_tokens; - size_t n_bytes; - double t_ms; - - virtual json to_json() override { - if (is_save) { - return json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_saved", n_tokens }, - { "n_written", n_bytes }, - { "timings", { - { "save_ms", t_ms } - }}, - }; - } else { - return json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_restored", n_tokens }, - { "n_read", n_bytes }, - { "timings", { - { "restore_ms", t_ms } - }}, - }; - } - } -}; - -struct server_task_result_slot_erase : server_task_result { - size_t n_erased; - - virtual json to_json() override { - return json { - { "id_slot", id_slot }, - { "n_erased", n_erased }, - }; - } -}; - -struct server_task_result_apply_lora : server_task_result { - virtual json to_json() override { - return json {{ "success", true }}; - } + json input_prefix; + json input_suffix; }; struct server_slot { int id; int id_task = -1; - - // only used for completion/embedding/infill/rerank - server_task_type task_type = SERVER_TASK_TYPE_COMPLETION; - - llama_batch batch_spec = {}; - - llama_context * ctx = nullptr; - llama_context * ctx_dft = nullptr; - - common_speculative * spec = nullptr; - - std::vector lora; - - // the index relative to completion multi-task request - size_t index = 0; + int id_multi = -1; struct slot_params params; slot_state state = SLOT_STATE_IDLE; + slot_command command = SLOT_COMMAND_NONE; // used to determine the slot that has been used the longest int64_t t_last_used = -1; @@ -1239,77 +144,72 @@ struct server_slot { int32_t i_batch = -1; int32_t n_predict = -1; // TODO: disambiguate from params.n_predict - // n_prompt_tokens may not be equal to prompt_tokens.size(), because prompt maybe truncated int32_t n_prompt_tokens = 0; int32_t n_prompt_tokens_processed = 0; - // input prompt tokens - llama_tokens prompt_tokens; + json prompt; // can be either a string, array of strings or array of token ids - size_t last_nl_pos = 0; - - std::string generated_text; - llama_tokens generated_tokens; - - llama_tokens cache_tokens; + // when a task is submitted, we first tokenize the prompt and store it here + std::vector prompt_tokens; + std::string generated_text; + std::vector cache_tokens; std::vector generated_token_probs; + bool infill = false; + bool embedding = false; bool has_next_token = true; - bool has_new_line = false; bool truncated = false; - stop_type stop; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; + bool oaicompat = false; + + std::string oaicompat_model; std::string stopping_word; // sampling + llama_token sampled; + struct llama_sampling_params sparams; + llama_sampling_context * ctx_sampling = nullptr; json json_schema; - struct common_sampler * smpl = nullptr; + int32_t ga_i = 0; // 
group-attention state + int32_t ga_n = 1; // group-attention factor + int32_t ga_w = 512; // group-attention width - llama_token sampled; - - common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + int32_t n_past_se = 0; // self-extend // stats - size_t n_sent_text = 0; // number of sent text character + size_t n_sent_text = 0; // number of sent text character + size_t n_sent_token_probs = 0; int64_t t_start_process_prompt; int64_t t_start_generation; double t_prompt_processing; // ms - double t_token_generation; // ms - - std::function callback_on_release; + double t_token_generation; // ms void reset() { - SLT_DBG(*this, "%s", "\n"); - n_prompt_tokens = 0; - last_nl_pos = 0; generated_text = ""; - has_new_line = false; truncated = false; - stop = STOP_TYPE_NONE; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; stopping_word = ""; n_past = 0; n_sent_text = 0; - task_type = SERVER_TASK_TYPE_COMPLETION; + n_sent_token_probs = 0; + infill = false; + ga_i = 0; + n_past_se = 0; - generated_tokens.clear(); generated_token_probs.clear(); } - bool is_non_causal() const { - return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK; - } - - bool can_batch_with(server_slot & other_slot) { - return is_non_causal() == other_slot.is_non_causal() - && are_lora_equal(lora, other_slot.lora); - } - - bool has_budget(const common_params & global_params) { + bool has_budget(gpt_params &global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless } @@ -1325,67 +225,60 @@ struct server_slot { return n_remaining > 0; // no budget } + bool available() const { + return state == SLOT_STATE_IDLE && command == SLOT_COMMAND_NONE; + } + bool is_processing() const { - return state != SLOT_STATE_IDLE; + return (state == SLOT_STATE_IDLE && command == SLOT_COMMAND_LOAD_PROMPT) || state == SLOT_STATE_PROCESSING; } - bool can_speculate() const { - return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt; - } - - void add_token(const completion_token_output & token) { - if (!is_processing()) { - SLT_WRN(*this, "%s", "slot is not processing\n"); + void add_token_string(const completion_token_output & token) { + if (command == SLOT_COMMAND_RELEASE) { return; } generated_token_probs.push_back(token); } void release() { - if (is_processing()) { - SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated); - - t_last_used = ggml_time_us(); + if (state == SLOT_STATE_PROCESSING) { t_token_generation = (ggml_time_us() - t_start_generation) / 1e3; - state = SLOT_STATE_IDLE; - callback_on_release(id); + command = SLOT_COMMAND_RELEASE; } } - result_timings get_timings() const { - result_timings timings; - timings.prompt_n = n_prompt_tokens_processed; - timings.prompt_ms = t_prompt_processing; - timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed; - timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + json get_formated_timings() const { + return json { + {"prompt_n", n_prompt_tokens_processed}, + {"prompt_ms", t_prompt_processing}, + {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed}, + {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed}, - timings.predicted_n = n_decoded; - timings.predicted_ms = t_token_generation; - timings.predicted_per_token_ms = t_token_generation / n_decoded; - timings.predicted_per_second = 1e3 / t_token_generation * n_decoded; - - return timings; + {"predicted_n", 
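// A minimal sketch of the per-request token budget logic referenced by has_budget()
// above: a request-level n_predict takes precedence, otherwise the server-wide default
// applies, and -1 on both sides means "no limit". Names are local stand-ins.

#include <cstdio>

static int remaining_budget(int n_decoded, int req_n_predict, int global_n_predict) {
    if (req_n_predict != -1)    { return req_n_predict    - n_decoded; }
    if (global_n_predict != -1) { return global_n_predict - n_decoded; }
    return 1;   // limitless: always report at least one token of budget
}

int main() {
    std::printf("%d\n", remaining_budget(10, 16, -1));   // 6 tokens left
    std::printf("%d\n", remaining_budget(10, -1, -1));   // limitless
}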
n_decoded}, + {"predicted_ms", t_token_generation}, + {"predicted_per_token_ms", t_token_generation / n_decoded}, + {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, + }; } - size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) { + size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) { size_t stop_pos = std::string::npos; for (const std::string & word : params.antiprompt) { size_t pos; - if (is_full_stop) { + if (type == STOP_TYPE_FULL) { const size_t tmp = word.size() + last_token_size; const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; pos = text.find(word, from_pos); } else { - // otherwise, partial stop pos = find_partial_stop_string(word, text); } if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) { - if (is_full_stop) { - stop = STOP_TYPE_WORD; + if (type == STOP_TYPE_FULL) { + stopped_word = true; stopping_word = word; has_next_token = false; } @@ -1397,42 +290,49 @@ struct server_slot { } void print_timings() const { - const double t_prompt = t_prompt_processing / n_prompt_tokens_processed; - const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + char buffer[512]; - const double t_gen = t_token_generation / n_decoded; - const double n_gen_second = 1e3 / t_token_generation * n_decoded; + double t_token = t_prompt_processing / n_prompt_tokens_processed; + double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - SLT_INF(*this, - "\n" - "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - " total time = %10.2f ms / %5d tokens\n", - t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, - t_token_generation, n_decoded, t_gen, n_gen_second, - t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); - } + snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", + t_prompt_processing, n_prompt_tokens_processed, + t_token, n_tokens_second); - json to_json() const { - return json { - {"id", id}, - {"id_task", id_task}, - {"n_ctx", n_ctx}, - {"speculative", can_speculate()}, - {"is_processing", is_processing()}, - {"non_causal", is_non_causal()}, - {"params", params.to_json()}, - {"prompt", common_detokenize(ctx, prompt_tokens)}, - {"next_token", - { - {"has_next_token", has_next_token}, - {"has_new_line", has_new_line}, - {"n_remain", n_remaining}, - {"n_decoded", n_decoded}, - {"stopping_word", stopping_word}, - } - }, - }; + LOG_INFO(buffer, { + {"id_slot", id}, + {"id_task", id_task}, + {"t_prompt_processing", t_prompt_processing}, + {"n_prompt_tokens_processed", n_prompt_tokens_processed}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + t_token = t_token_generation / n_decoded; + n_tokens_second = 1e3 / t_token_generation * n_decoded; + + snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + t_token_generation, n_decoded, + t_token, n_tokens_second); + + LOG_INFO(buffer, { + {"id_slot", id}, + {"id_task", id_task}, + {"t_token_generation", t_token_generation}, + {"n_decoded", n_decoded}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation); + + 
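// find_stopping_strings() above distinguishes a full stop-word match from a partial one
// at the very end of the generated text, so a half-typed stop word is not streamed to
// the client. A minimal sketch of the partial check, written as a free function rather
// than the server's find_partial_stop_string():

#include <algorithm>
#include <cstdio>
#include <string>

// returns the position where a proper prefix of `stop` begins at the end of `text`, or npos
static size_t find_partial_stop(const std::string & stop, const std::string & text) {
    if (stop.empty() || text.empty()) { return std::string::npos; }
    for (size_t len = std::min(stop.size() - 1, text.size()); len > 0; --len) {
        if (text.compare(text.size() - len, len, stop, 0, len) == 0) {
            return text.size() - len;   // hold back these trailing characters
        }
    }
    return std::string::npos;
}

int main() {
    // "</s" is a prefix of the stop word "</s>", so the last 3 characters are held back
    std::printf("partial at %zu\n", find_partial_stop("</s>", "Hello</s"));                   // 5
    std::printf("no partial: %d\n", find_partial_stop("</s>", "Hello") == std::string::npos); // 1
}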
LOG_INFO(buffer, { + {"id_slot", id}, + {"id_task", id_task}, + {"t_prompt_processing", t_prompt_processing}, + {"t_token_generation", t_token_generation}, + {"t_total", t_prompt_processing + t_token_generation}, + }); } }; @@ -1450,9 +350,6 @@ struct server_metrics { uint64_t n_tokens_predicted = 0; uint64_t t_tokens_generation = 0; - uint64_t n_decode_total = 0; - uint64_t n_busy_slots_total = 0; - void init() { t_start = ggml_time_us(); } @@ -1471,15 +368,6 @@ struct server_metrics { t_tokens_generation_total += slot.t_token_generation; } - void on_decoded(const std::vector & slots) { - n_decode_total++; - for (const auto & slot : slots) { - if (slot.is_processing()) { - n_busy_slots_total++; - } - } - } - void reset_bucket() { n_prompt_tokens_processed = 0; t_prompt_processing = 0; @@ -1493,89 +381,68 @@ struct server_queue { bool running; // queues - std::deque queue_tasks; - std::deque queue_tasks_deferred; + std::vector queue_tasks; + std::vector queue_tasks_deferred; + + std::vector queue_multitasks; std::mutex mutex_tasks; std::condition_variable condition_tasks; // callback functions - std::function callback_new_task; - std::function callback_update_slots; + std::function callback_new_task; + std::function callback_finish_multitask; + std::function callback_update_slots; // Add a new task to the end of the queue - int post(server_task task, bool front = false) { + int post(server_task task) { std::unique_lock lock(mutex_tasks); - GGML_ASSERT(task.id != -1); - // if this is cancel task make sure to clean up pending tasks - if (task.type == SERVER_TASK_TYPE_CANCEL) { - cleanup_pending_task(task.id_target); - } - QUE_DBG("new task, id = %d, front = %d\n", task.id, front); - if (front) { - queue_tasks.push_front(std::move(task)); - } else { - queue_tasks.push_back(std::move(task)); + if (task.id == -1) { + task.id = id++; + LOG_VERBOSE("new task id", {{"new_id", task.id}}); } + queue_tasks.push_back(std::move(task)); condition_tasks.notify_one(); return task.id; } - // multi-task version of post() - int post(std::vector & tasks, bool front = false) { - std::unique_lock lock(mutex_tasks); - for (auto & task : tasks) { - if (task.id == -1) { - task.id = id++; - } - // if this is cancel task make sure to clean up pending tasks - if (task.type == SERVER_TASK_TYPE_CANCEL) { - cleanup_pending_task(task.id_target); - } - QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); - if (front) { - queue_tasks.push_front(std::move(task)); - } else { - queue_tasks.push_back(std::move(task)); - } - } - condition_tasks.notify_one(); - return 0; - } - // Add a new task, but defer until one slot is available void defer(server_task task) { std::unique_lock lock(mutex_tasks); - QUE_DBG("defer task, id = %d\n", task.id); queue_tasks_deferred.push_back(std::move(task)); - condition_tasks.notify_one(); } - // Get the next id for creating a new task + // Get the next id for creating anew task int get_new_id() { std::unique_lock lock(mutex_tasks); int new_id = id++; + LOG_VERBOSE("new task id", {{"new_id", new_id}}); return new_id; } // Register function to process a new task - void on_new_task(std::function callback) { + void on_new_task(std::function callback) { callback_new_task = std::move(callback); } + // Register function to process a multitask when it is finished + void on_finish_multitask(std::function callback) { + callback_finish_multitask = std::move(callback); + } + // Register the function to be called when all slots data is ready to be processed void 
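// A stripped-down sketch of the queue pattern used by server_queue above: post()
// appends under a mutex and wakes the worker, defer() parks a task until a slot frees
// up. This is an illustration only; it omits task ids, cancellation and the promotion
// of deferred tasks back into the main queue, and uses strings instead of server_task.

#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <string>

struct tiny_queue {
    std::deque<std::string>  tasks;
    std::deque<std::string>  deferred;
    std::mutex               m;
    std::condition_variable  cv;

    void post(std::string t, bool front = false) {
        std::unique_lock<std::mutex> lock(m);
        if (front) { tasks.push_front(std::move(t)); } else { tasks.push_back(std::move(t)); }
        cv.notify_one();                       // wake the processing loop
    }

    void defer(std::string t) {                // no slot available: park the task
        std::unique_lock<std::mutex> lock(m);
        deferred.push_back(std::move(t));
    }

    std::string wait_and_pop() {
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock, [&] { return !tasks.empty(); });
        std::string t = std::move(tasks.front());
        tasks.pop_front();
        return t;
    }
};

int main() {
    tiny_queue q;
    q.post("completion");
    q.post("cancel", /*front=*/true);                 // urgent tasks can jump the queue
    std::printf("%s\n", q.wait_and_pop().c_str());    // cancel
    std::printf("%s\n", q.wait_and_pop().c_str());    // completion
}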
on_update_slots(std::function callback) { callback_update_slots = std::move(callback); } - // Call when the state of one slot is changed, it will move one task from deferred to main queue - void pop_deferred_task() { + // Call when the state of one slot is changed + void notify_slot_changed() { + // move deferred tasks back to main loop std::unique_lock lock(mutex_tasks); - if (!queue_tasks_deferred.empty()) { - queue_tasks.emplace_back(std::move(queue_tasks_deferred.front())); - queue_tasks_deferred.pop_front(); + for (auto & task : queue_tasks_deferred) { + queue_tasks.push_back(std::move(task)); } - condition_tasks.notify_one(); + queue_tasks_deferred.clear(); } // end the start_loop routine @@ -1596,7 +463,7 @@ struct server_queue { running = true; while (true) { - QUE_DBG("%s", "processing new tasks\n"); + LOG_VERBOSE("new task may arrive", {}); while (true) { std::unique_lock lock(mutex_tasks); @@ -1605,24 +472,39 @@ struct server_queue { break; } server_task task = queue_tasks.front(); - queue_tasks.pop_front(); + queue_tasks.erase(queue_tasks.begin()); lock.unlock(); + LOG_VERBOSE("callback_new_task", {{"id_task", task.id}}); + callback_new_task(task); + } - QUE_DBG("processing task, id = %d\n", task.id); - callback_new_task(std::move(task)); + LOG_VERBOSE("update_multitasks", {}); + + // check if we have any finished multitasks + auto queue_iterator = queue_multitasks.begin(); + while (queue_iterator != queue_multitasks.end()) { + if (queue_iterator->subtasks_remaining.empty()) { + // all subtasks done == multitask is done + server_task_multi current_multitask = *queue_iterator; + callback_finish_multitask(current_multitask); + // remove this multitask + queue_iterator = queue_multitasks.erase(queue_iterator); + } else { + ++queue_iterator; + } } // all tasks in the current loop is processed, slots data is now ready - QUE_DBG("%s", "update slots\n"); + LOG_VERBOSE("callback_update_slots", {}); callback_update_slots(); - QUE_DBG("%s", "waiting for new tasks\n"); + LOG_VERBOSE("wait for new task", {}); { std::unique_lock lock(mutex_tasks); if (queue_tasks.empty()) { if (!running) { - QUE_DBG("%s", "terminate\n"); + LOG_VERBOSE("ending start_loop", {}); return; } condition_tasks.wait(lock, [&]{ @@ -1633,130 +515,103 @@ struct server_queue { } } -private: - void cleanup_pending_task(int id_target) { - // no need lock because this is called exclusively by post() - auto rm_func = [id_target](const server_task & task) { - return task.id_target == id_target; - }; - queue_tasks.erase( - std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func), - queue_tasks.end()); - queue_tasks_deferred.erase( - std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func), - queue_tasks_deferred.end()); + // + // functions to manage multitasks + // + + // add a multitask by specifying the id of all subtask (subtask is a server_task) + void add_multitask(int id_multi, std::vector & sub_ids) { + std::lock_guard lock(mutex_tasks); + server_task_multi multi; + multi.id = id_multi; + std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end())); + queue_multitasks.push_back(multi); + } + + // updatethe remaining subtasks, while appending results to multitask + void update_multitask(int id_multi, int id_sub, server_task_result & result) { + std::lock_guard lock(mutex_tasks); + for (auto & multitask : queue_multitasks) { + if (multitask.id == id_multi) { + multitask.subtasks_remaining.erase(id_sub); + 
multitask.results.push_back(result); + } + } } }; struct server_response { - // for keeping track of all tasks waiting for the result - std::unordered_set waiting_task_ids; + typedef std::function callback_multitask_t; + callback_multitask_t callback_update_multitask; - // the main result queue (using ptr for polymorphism) - std::vector queue_results; + // for keeping track of all tasks waiting for the result + std::set waiting_task_ids; + + // the main result queue + std::vector queue_results; std::mutex mutex_results; std::condition_variable condition_results; // add the id_task to the list of tasks waiting for response void add_waiting_task_id(int id_task) { - SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size()); + LOG_VERBOSE("waiting for task id", {{"id_task", id_task}}); std::unique_lock lock(mutex_results); waiting_task_ids.insert(id_task); } - void add_waiting_tasks(const std::vector & tasks) { - std::unique_lock lock(mutex_results); - - for (const auto & task : tasks) { - SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size()); - waiting_task_ids.insert(task.id); - } - } - // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int id_task) { - SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); + LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}}); std::unique_lock lock(mutex_results); waiting_task_ids.erase(id_task); - // make sure to clean up all pending results - queue_results.erase( - std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) { - return res->id == id_task; - }), - queue_results.end()); } - void remove_waiting_task_ids(const std::unordered_set & id_tasks) { - std::unique_lock lock(mutex_results); - - for (const auto & id_task : id_tasks) { - SRV_DBG("remove task %d from waiting list. 
current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); - waiting_task_ids.erase(id_task); - } - } - - // This function blocks the thread until there is a response for one of the id_tasks - server_task_result_ptr recv(const std::unordered_set & id_tasks) { + // This function blocks the thread until there is a response for this id_task + server_task_result recv(int id_task) { while (true) { std::unique_lock lock(mutex_results); condition_results.wait(lock, [&]{ return !queue_results.empty(); }); - for (size_t i = 0; i < queue_results.size(); i++) { - if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - server_task_result_ptr res = std::move(queue_results[i]); - queue_results.erase(queue_results.begin() + i); - return res; - } - } - } - - // should never reach here - } - - // same as recv(), but have timeout in seconds - // if timeout is reached, nullptr is returned - server_task_result_ptr recv_with_timeout(const std::unordered_set & id_tasks, int timeout) { - while (true) { - std::unique_lock lock(mutex_results); - for (int i = 0; i < (int) queue_results.size(); i++) { - if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - server_task_result_ptr res = std::move(queue_results[i]); + if (queue_results[i].id == id_task) { + assert(queue_results[i].id_multi == -1); + server_task_result res = queue_results[i]; queue_results.erase(queue_results.begin() + i); return res; } } - - std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout)); - if (cr_res == std::cv_status::timeout) { - return nullptr; - } } // should never reach here } - // single-task version of recv() - server_task_result_ptr recv(int id_task) { - std::unordered_set id_tasks = {id_task}; - return recv(id_tasks); + // Register the function to update multitask + void on_multitask_update(callback_multitask_t callback) { + callback_update_multitask = std::move(callback); } // Send a new result to a waiting id_task - void send(server_task_result_ptr && result) { - SRV_DBG("sending result for task id = %d\n", result->id); + void send(server_task_result result) { + LOG_VERBOSE("send new result", {{"id_task", result.id}}); std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { - if (result->id == id_task) { - SRV_DBG("task id = %d pushed to result queue\n", result->id); + // LOG_TEE("waiting task id %i \n", id_task); + // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result + if (result.id_multi == id_task) { + LOG_VERBOSE("callback_update_multitask", {{"id_task", id_task}}); + callback_update_multitask(id_task, result.id, result); + continue; + } - queue_results.emplace_back(std::move(result)); + if (result.id == id_task) { + LOG_VERBOSE("queue_results.push_back", {{"id_task", id_task}}); + queue_results.push_back(result); condition_results.notify_all(); return; } @@ -1765,29 +620,24 @@ struct server_response { }; struct server_context { - common_params params_base; - - // note: keep these alive - they determine the lifetime of the model, context, etc. 
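// A sketch of the recv_with_timeout() pattern above: wait on a condition variable for
// a bounded time and report a timeout instead of blocking forever. Result payloads are
// reduced to ints here; the real code scans for results matching the waiting task ids.

#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <optional>

struct result_box {
    std::deque<int>          results;
    std::mutex               m;
    std::condition_variable  cv;

    void send(int r) {
        std::lock_guard<std::mutex> lock(m);
        results.push_back(r);
        cv.notify_all();
    }

    std::optional<int> recv_with_timeout(int seconds) {
        std::unique_lock<std::mutex> lock(m);
        while (results.empty()) {
            if (cv.wait_for(lock, std::chrono::seconds(seconds)) == std::cv_status::timeout) {
                return std::nullopt;           // caller can turn this into an HTTP timeout error
            }
        }
        int r = results.front();
        results.pop_front();
        return r;
    }
};

int main() {
    result_box box;
    box.send(42);
    if (auto r = box.recv_with_timeout(1)) { std::printf("got %d\n", *r); }
    if (!box.recv_with_timeout(1))         { std::printf("timed out\n"); }
}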
- common_init_result llama_init; - common_init_result llama_init_dft; - llama_model * model = nullptr; llama_context * ctx = nullptr; - const llama_vocab * vocab = nullptr; + gpt_params params; - llama_model * model_dft = nullptr; - - llama_context_params cparams_dft; - - llama_batch batch = {}; + llama_batch batch; bool clean_kv_cache = true; bool add_bos_token = true; - bool has_eos_token = false; int32_t n_ctx; // total context for all clients / slots + // system prompt + bool system_need_update = false; + + std::string system_prompt; + std::vector system_tokens; + // slots / clients std::vector slots; json default_generation_settings_for_props; @@ -1800,185 +650,156 @@ struct server_context { // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; - common_chat_templates chat_templates; - ~server_context() { + if (ctx) { + llama_free(ctx); + ctx = nullptr; + } + + if (model) { + llama_free_model(model); + model = nullptr; + } + // Clear any sampling context for (server_slot & slot : slots) { - common_sampler_free(slot.smpl); - slot.smpl = nullptr; - - llama_free(slot.ctx_dft); - slot.ctx_dft = nullptr; - - common_speculative_free(slot.spec); - slot.spec = nullptr; - - llama_batch_free(slot.batch_spec); + if (slot.ctx_sampling != nullptr) { + llama_sampling_free(slot.ctx_sampling); + } } llama_batch_free(batch); } - bool load_model(const common_params & params) { - SRV_INF("loading model '%s'\n", params.model.c_str()); + bool load_model(const gpt_params & params_) { + params = params_; - params_base = params; - - llama_init = common_init_from_params(params_base); - - model = llama_init.model.get(); - ctx = llama_init.context.get(); + // dedicate one sequence to the system prompt + params.n_parallel += 1; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + params.n_parallel -= 1; // but be sneaky about it if (model == nullptr) { - SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str()); + LOG_ERROR("unable to load model", {{"model", params.model}}); return false; } - vocab = llama_model_get_vocab(model); - n_ctx = llama_n_ctx(ctx); - add_bos_token = llama_vocab_get_add_bos(vocab); - has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; - - if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) { - SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str()); - - auto params_dft = params_base; - - params_dft.devices = params_base.speculative.devices; - params_dft.hf_file = params_base.speculative.hf_file; - params_dft.hf_repo = params_base.speculative.hf_repo; - params_dft.model = params_base.speculative.model; - params_dft.model_url = params_base.speculative.model_url; - params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? 
params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx; - params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; - params_dft.n_parallel = 1; - - llama_init_dft = common_init_from_params(params_dft); - - model_dft = llama_init_dft.model.get(); - - if (model_dft == nullptr) { - SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str()); - return false; - } - - if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) { - SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str()); - - return false; - } - - const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); - - cparams_dft = common_context_params_to_llama(params_dft); - cparams_dft.n_batch = n_ctx_dft; - - // force F16 KV cache for the draft model for extra performance - cparams_dft.type_k = GGML_TYPE_F16; - cparams_dft.type_v = GGML_TYPE_F16; - - // the context is not needed - we will create one for each slot - llama_init_dft.context.reset(); - } - - if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) { - SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); - chat_templates = common_chat_templates_from_model(model, "chatml"); - } else { - chat_templates = common_chat_templates_from_model(model, params_base.chat_template); - } - GGML_ASSERT(chat_templates.template_default.get() != nullptr); + add_bos_token = llama_should_add_bos_token(model); + GGML_ASSERT(llama_add_eos_token(model) != 1); return true; } - bool validate_builtin_chat_template(bool use_jinja) const { + bool validate_model_chat_template() const { llama_chat_message chat[] = {{"user", "test"}}; - if (use_jinja) { - auto templates = common_chat_templates_from_model(model, ""); - common_chat_inputs inputs; - inputs.messages = json::array({{ - {"role", "user"}, - {"content", "test"}, - }}); - GGML_ASSERT(templates.template_default); - try { - common_chat_params_init(*templates.template_default, inputs); - if (templates.template_tool_use) { - common_chat_params_init(*templates.template_tool_use, inputs); - } - return true; - } catch (const std::exception & e) { - SRV_ERR("failed to apply template: %s\n", e.what()); - return false; - } - } else { - const char * tmpl = llama_model_chat_template(model, /* name */ nullptr); - const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0); - return chat_res > 0; - } + const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0); + + return res > 0; } void init() { - const int32_t n_ctx_slot = n_ctx / params_base.n_parallel; + const int32_t n_ctx_slot = n_ctx / params.n_parallel; - SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); + LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); - for (int i = 0; i < params_base.n_parallel; i++) { + for (int i = 0; i < params.n_parallel; i++) { server_slot slot; slot.id = i; - slot.ctx = ctx; slot.n_ctx = n_ctx_slot; - slot.n_predict = params_base.n_predict; + slot.n_predict = params.n_predict; - if (model_dft) { - slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); + LOG_INFO("new slot", { + {"id_slot", slot.id}, + {"n_ctx_slot", slot.n_ctx} + }); - slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft); - if (slot.ctx_dft == nullptr) { - SRV_ERR("%s", "failed to 
create draft context\n"); - return; - } + const int ga_n = params.grp_attn_n; + const int ga_w = params.grp_attn_w; - slot.spec = common_speculative_init(slot.ctx_dft); - if (slot.spec == nullptr) { - SRV_ERR("%s", "failed to create speculator\n"); - return; - } + if (ga_n != 1) { + GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT + //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT + //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT + + LOG_INFO("slot self-extend", { + {"id_slot", slot.id}, + {"ga_n", ga_n}, + {"ga_w", ga_w} + }); } - SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); + slot.ga_i = 0; + slot.ga_n = ga_n; + slot.ga_w = ga_w; - slot.params.sampling = params_base.sampling; - - slot.callback_on_release = [this](int) { - queue_tasks.pop_deferred_task(); - }; + slot.sparams = params.sparams; slot.reset(); slots.push_back(slot); } - default_generation_settings_for_props = slots[0].to_json(); + default_generation_settings_for_props = get_formated_generation(slots.front()); + default_generation_settings_for_props["seed"] = -1; - // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens + // the update_slots() logic will always submit a maximum of n_batch tokens // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) { const int32_t n_batch = llama_n_batch(ctx); // only a single seq_id per token is needed - batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); + batch = llama_batch_init(n_batch, 0, 1); } metrics.init(); } + std::vector tokenize(const json & json_prompt, bool add_special) const { + // TODO: currently, we tokenize using special tokens by default + // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) + // but it's better compared to completely ignoring ChatML and other chat templates + const bool TMP_FORCE_SPECIAL = true; + + // If `add_bos` is true, we only add BOS, when json_prompt is a string, + // or the first element of the json_prompt array is a string. 
+ std::vector prompt_tokens; + + if (json_prompt.is_array()) { + bool first = true; + for (const auto & p : json_prompt) { + if (p.is_string()) { + auto s = p.template get(); + + std::vector p; + if (first) { + p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); + first = false; + } else { + p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); + } + + prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + } else { + if (first) { + first = false; + } + + prompt_tokens.push_back(p.template get()); + } + } + } else { + auto s = json_prompt.template get(); + prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); + } + + return prompt_tokens; + } + server_slot * get_slot_by_id(int id) { for (server_slot & slot : slots) { if (slot.id == id) { @@ -1989,41 +810,50 @@ struct server_context { return nullptr; } - server_slot * get_available_slot(const server_task & task) { + server_slot * get_available_slot(const std::string & prompt) { server_slot * ret = nullptr; // find the slot that has at least n% prompt similarity - if (ret == nullptr && slot_prompt_similarity != 0.0f) { - int lcs_len = 0; + if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) { + int max_lcp_len = 0; float similarity = 0; for (server_slot & slot : slots) { // skip the slot if it is not available - if (slot.is_processing()) { + if (!slot.available()) { continue; } - // skip the slot if it does not contains cached tokens - if (slot.cache_tokens.empty()) { + // skip the slot if it does not contains prompt + if (!slot.prompt.is_string()) { continue; } - // length of the Longest Common Subsequence between the current slot's prompt and the input prompt - int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens); + // current slot's prompt + std::string slot_prompt = slot.prompt.get(); - // fraction of the common subsequence length compared to the current slot's prompt length - float cur_similarity = static_cast(cur_lcs_len) / static_cast(slot.cache_tokens.size()); + // length of the current slot's prompt + int slot_prompt_len = slot_prompt.size(); + + // length of the Longest Common Prefix between the current slot's prompt and the input prompt + int lcp_len = common_part(slot_prompt, prompt); + + // fraction of the common substring length compared to the current slot's prompt length + similarity = static_cast(lcp_len) / slot_prompt_len; // select the current slot if the criteria match - if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) { - lcs_len = cur_lcs_len; - similarity = cur_similarity; + if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) { + max_lcp_len = lcp_len; ret = &slot; } } if (ret != nullptr) { - SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity); + LOG_VERBOSE("selected slot by lcp similarity", { + {"id_slot", ret->id}, + {"max_lcp_len", max_lcp_len}, + {"similarity", similarity}, + }); } } @@ -2032,7 +862,7 @@ struct server_context { int64_t t_last = ggml_time_us(); for (server_slot & slot : slots) { // skip the slot if it is not available - if (slot.is_processing()) { + if (!slot.available()) { continue; } @@ -2044,7 +874,10 @@ struct server_context { } if (ret != nullptr) { - SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last); + LOG_VERBOSE("selected slot by lru", { + {"id_slot", ret->id}, + {"t_last", t_last}, + }); } } @@ -2052,108 +885,364 @@ struct server_context { } bool launch_slot_with_task(server_slot & slot, const server_task & 
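// A sketch of the slot-selection heuristic in get_available_slot() above: among idle
// slots, prefer the one whose cached prompt shares the longest common prefix with the
// incoming prompt, but only if that prefix covers more than slot_prompt_similarity of
// the cached prompt. Plain strings stand in for token vectors, and the example prompts
// are invented.

#include <cstdio>
#include <string>
#include <vector>

static size_t common_prefix_len(const std::string & a, const std::string & b) {
    size_t n = 0;
    while (n < a.size() && n < b.size() && a[n] == b[n]) { ++n; }
    return n;
}

int main() {
    const std::vector<std::string> cached = { "You are a pirate. Hello", "Translate to French: cat" };
    const std::string prompt             = "You are a pirate. What is 2+2?";
    const float       min_similarity     = 0.5f;

    int    best     = -1;
    size_t best_lcp = 0;
    for (size_t i = 0; i < cached.size(); ++i) {
        const size_t lcp        = common_prefix_len(cached[i], prompt);
        const float  similarity = (float) lcp / cached[i].size();
        if (lcp > best_lcp && similarity > min_similarity) {
            best_lcp = lcp;
            best     = (int) i;
        }
    }
    std::printf("selected slot %d (lcp = %zu)\n", best, best_lcp);   // slot 0
}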
task) { - slot.reset(); - slot.id_task = task.id; - slot.index = task.index; - slot.task_type = task.type; - slot.params = std::move(task.params); - slot.prompt_tokens = std::move(task.prompt_tokens); + slot_params default_params; + // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) + llama_sampling_params default_sparams = params.sparams; + auto & data = task.data; - if (!are_lora_equal(task.params.lora, slot.lora)) { - // if lora is changed, we cannot reuse cached tokens - slot.cache_tokens.clear(); - slot.lora = task.params.lora; + if (data.count("__oaicompat") != 0) { + slot.oaicompat = true; + slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + } else { + slot.oaicompat = false; + slot.oaicompat_model = ""; } - SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); + slot.params.stream = json_value(data, "stream", false); + slot.params.cache_prompt = json_value(data, "cache_prompt", false); + slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict); + slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); + slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); + slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); + slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); + slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); + slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); + slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); + slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); + slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep); + slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard); + slot.sparams.seed = json_value(data, "seed", default_sparams.seed); + slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); + slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); + + // process "json_schema" and "grammar" + if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { + send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST); + return false; + } else if (data.contains("json_schema") && !data.contains("grammar")) { + try { + auto schema = json_value(data, "json_schema", json::object()); + slot.sparams.grammar = json_schema_to_grammar(schema); + } catch (const std::exception & e) { + send_error(task, std::string("\"json_schema\": ") + 
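// Both versions of the hunk above funnel request parsing through a
// json_value(body, key, default) helper. A minimal sketch of that idiom with
// nlohmann::json, where a missing key, a null value, or a type mismatch falls back
// to the server-side default instead of throwing:

#include <cstdio>
#include <exception>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

template <typename T>
static T json_value(const json & body, const char * key, const T & def) {
    if (!body.contains(key) || body.at(key).is_null()) { return def; }
    try {
        return body.at(key).get<T>();
    } catch (const std::exception &) {
        return def;                      // wrong type: keep the default
    }
}

int main() {
    const json data = json::parse(R"({"temperature": 0.2, "stream": true})");
    const float temp   = json_value(data, "temperature", 0.8f);   // 0.2 (overridden)
    const int   top_k  = json_value(data, "top_k", 40);           // 40  (default)
    const bool  stream = json_value(data, "stream", false);       // true
    std::printf("temp=%.1f top_k=%d stream=%d\n", temp, top_k, stream);
}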
e.what(), ERROR_TYPE_INVALID_REQUEST); + return false; + } + } else { + slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + } + + if (slot.params.cache_prompt && slot.ga_n != 1) { + LOG_WARNING("cache_prompt is not supported with group-attention", {}); + slot.params.cache_prompt = false; + } if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { // Might be better to reject the request with a 400 ? + LOG_WARNING("Max tokens to predict exceeds server configuration", { + {"params.n_predict", slot.params.n_predict}, + {"slot.n_predict", slot.n_predict}, + }); slot.params.n_predict = slot.n_predict; - SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict); } - if (slot.params.ignore_eos && has_eos_token) { - slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY}); + // infill + slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix); + slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); + + // get prompt + if (!task.infill) { + const auto & prompt = data.find("prompt"); + if (prompt == data.end()) { + send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + if ((prompt->is_string()) || + (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) || + (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) { + slot.prompt = *prompt; + } else { + send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST); + return false; + } + } + + // penalize user-provided tokens + { + slot.sparams.penalty_prompt_tokens.clear(); + slot.sparams.use_penalty_prompt_tokens = false; + + const auto & penalty_prompt = data.find("penalty_prompt"); + + if (penalty_prompt != data.end()) { + if (penalty_prompt->is_string()) { + const auto penalty_prompt_string = penalty_prompt->get(); + slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false); + + if (slot.params.n_predict > 0) { + slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict); + } + slot.sparams.use_penalty_prompt_tokens = true; + + LOG_VERBOSE("penalty_prompt_tokens", { + {"id_slot", slot.id}, + {"tokens", slot.sparams.penalty_prompt_tokens}, + }); + } + else if (penalty_prompt->is_array()) { + const auto n_tokens = penalty_prompt->size(); + slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict)); + + const int n_vocab = llama_n_vocab(model); + for (const auto & penalty_token : *penalty_prompt) { + if (penalty_token.is_number_integer()) { + const auto tok = penalty_token.get(); + if (tok >= 0 && tok < n_vocab) { + slot.sparams.penalty_prompt_tokens.push_back(tok); + } + } + } + slot.sparams.use_penalty_prompt_tokens = true; + + LOG_VERBOSE("penalty_prompt_tokens", { + {"id_slot", slot.id}, + {"tokens", slot.sparams.penalty_prompt_tokens}, + }); + } + } } { - if (slot.smpl != nullptr) { - common_sampler_free(slot.smpl); + slot.sparams.logit_bias.clear(); + + if (json_value(data, "ignore_eos", false)) { + slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } - slot.smpl = common_sampler_init(model, slot.params.sampling); - if (slot.smpl == nullptr) { + const auto & logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) { + const int n_vocab = llama_n_vocab(model); + for (const auto & el : 
*logit_bias) { + // TODO: we may want to throw errors here, in case "el" is incorrect + if (el.is_array() && el.size() == 2) { + float bias; + if (el[1].is_number()) { + bias = el[1].get(); + } else if (el[1].is_boolean() && !el[1].get()) { + bias = -INFINITY; + } else { + continue; + } + + if (el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { + slot.sparams.logit_bias[tok] = bias; + } + } else if (el[0].is_string()) { + auto toks = llama_tokenize(model, el[0].get(), false); + for (auto tok : toks) { + slot.sparams.logit_bias[tok] = bias; + } + } + } + } + } + } + + { + slot.params.antiprompt.clear(); + + const auto & stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) { + for (const auto & word : *stop) { + if (!word.empty()) { + slot.params.antiprompt.push_back(word); + } + } + } + } + + { + const auto & samplers_sequence = data.find("samplers"); + if (samplers_sequence != data.end() && samplers_sequence->is_array()) { + std::vector sampler_names; + for (const auto & sampler_name : *samplers_sequence) { + if (sampler_name.is_string()) { + sampler_names.emplace_back(sampler_name); + } + } + slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false); + } else { + slot.sparams.samplers_sequence = default_sparams.samplers_sequence; + } + } + + { + if (slot.ctx_sampling != nullptr) { + llama_sampling_free(slot.ctx_sampling); + } + slot.ctx_sampling = llama_sampling_init(slot.sparams); + if (slot.ctx_sampling == nullptr) { // for now, the only error that may happen here is invalid grammar send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); return false; } } - if (slot.ctx_dft) { - llama_batch_free(slot.batch_spec); + slot.command = SLOT_COMMAND_LOAD_PROMPT; + slot.prompt_tokens.clear(); - slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1); - } - - slot.state = SLOT_STATE_STARTED; - - SLT_INF(slot, "%s", "processing task\n"); + LOG_INFO("slot is processing task", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + }); return true; } void kv_cache_clear() { - SRV_DBG("%s", "clearing KV cache\n"); + LOG_VERBOSE("clearing KV cache", {}); // clear the entire KV cache llama_kv_cache_clear(ctx); clean_kv_cache = false; } + void system_prompt_update() { + LOG_VERBOSE("system prompt update", { + {"system_prompt", system_prompt}, + }); + + kv_cache_clear(); + system_tokens.clear(); + + if (!system_prompt.empty()) { + system_tokens = ::llama_tokenize(ctx, system_prompt, true); + + llama_batch_clear(batch); + + for (int i = 0; i < (int)system_tokens.size(); ++i) { + llama_batch_add(batch, system_tokens[i], i, { 0 }, false); + } + + const int32_t n_batch = llama_n_batch(ctx); + + for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + + if (llama_decode(ctx, batch_view) != 0) { + LOG_ERROR("llama_decode() failed", {}); + return; + } + } + + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i <= params.n_parallel; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + } + } + + system_need_update = false; + } + + bool system_prompt_set(const std::string & sys_prompt) { + system_prompt = sys_prompt; + + LOG_VERBOSE("system prompt process", { + {"system_prompt", system_prompt}, + }); + + // 
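The logit_bias parsing above accepts `[token, bias]` pairs, with `false` as shorthand for banning a token outright. A sketch of the same idea; string keys, which the server additionally tokenizes, are skipped here to keep the example self-contained:

```cpp
#include <cmath>
#include <cstdio>
#include <unordered_map>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Parse [[token_id, bias], ...] where bias is a number, or `false` meaning
// "ban this token" (-inf). Out-of-range ids and malformed entries are ignored.
static std::unordered_map<int, float> parse_logit_bias(const json & arr, int n_vocab) {
    std::unordered_map<int, float> bias_map;
    if (!arr.is_array()) {
        return bias_map;
    }
    for (const auto & el : arr) {
        if (!el.is_array() || el.size() != 2 || !el[0].is_number_integer()) {
            continue;
        }
        float bias;
        if (el[1].is_number()) {
            bias = el[1].get<float>();
        } else if (el[1].is_boolean() && !el[1].get<bool>()) {
            bias = -INFINITY;
        } else {
            continue;
        }
        const int tok = el[0].get<int>();
        if (tok >= 0 && tok < n_vocab) {
            bias_map[tok] = bias;
        }
    }
    return bias_map;
}

int main() {
    const auto bias = parse_logit_bias(json::parse("[[15043, 1.5], [2, false]]"), 32000);
    printf("bias[15043] = %.1f, bias[2] = %f\n", bias.at(15043), bias.at(2));
    return 0;
}
```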
release all slots + for (server_slot & slot : slots) { + slot.release(); + } + + system_need_update = true; + return true; + } + bool process_token(completion_token_output & result, server_slot & slot) { // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = result.text_to_send; + const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special); slot.sampled = result.tok; + // search stop word and delete it slot.generated_text += token_str; - if (slot.params.return_tokens) { - slot.generated_tokens.push_back(result.tok); - } slot.has_next_token = true; - // check if there is incomplete UTF-8 character at the end - bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size(); + if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) { + // we can change penalty_prompt_tokens because it is always created from scratch each request + slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); + } + + // check if there is incomplete UTF-8 character at the end + bool incomplete = false; + for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) { + unsigned char c = slot.generated_text[slot.generated_text.size() - i]; + if ((c & 0xC0) == 0x80) { + // continuation byte: 10xxxxxx + continue; + } + if ((c & 0xE0) == 0xC0) { + // 2-byte character: 110xxxxx ... + incomplete = i < 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character: 1110xxxx ... + incomplete = i < 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character: 11110xxx ... + incomplete = i < 4; + } + // else 1-byte character or invalid byte + break; + } - // search stop word and delete it if (!incomplete) { size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); const std::string str_test = slot.generated_text.substr(pos); - bool send_text = true; + bool is_stop_full = false; - size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true); + size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL); if (stop_pos != std::string::npos) { + is_stop_full = true; slot.generated_text.erase( slot.generated_text.begin() + pos + stop_pos, slot.generated_text.end()); pos = std::min(slot.n_sent_text, slot.generated_text.size()); - } else if (slot.has_next_token) { - stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false); - send_text = stop_pos == std::string::npos; + } else { + is_stop_full = false; + stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL); } // check if there is any token to predict - if (send_text) { + if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { // no send the stop word in the response result.text_to_send = slot.generated_text.substr(pos, std::string::npos); slot.n_sent_text += result.text_to_send.size(); // add the token to slot queue and cache - } else { - result.text_to_send = ""; } - slot.add_token(result); + slot.add_token_string(result); if (slot.params.stream) { send_partial_response(slot, result); } @@ -2164,256 +1253,235 @@ struct server_context { } // check the limits - if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { - slot.stop = STOP_TYPE_LIMIT; + if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) { + slot.stopped_limit = true; slot.has_next_token = false; - SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, 
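The continuation-byte scan in process_token can be read as a standalone predicate: walk back at most four bytes from the end of the accumulated text and check whether the last lead byte announces more bytes than have actually arrived. A self-contained version of the same check:

```cpp
#include <cassert>
#include <string>

// Returns true if the string ends in the middle of a multi-byte UTF-8 sequence.
static bool ends_with_incomplete_utf8(const std::string & text) {
    for (unsigned i = 1; i <= 4 && i <= text.size(); ++i) {
        const unsigned char c = text[text.size() - i];
        if ((c & 0xC0) == 0x80) {
            continue;      // continuation byte 10xxxxxx - keep walking back
        }
        if ((c & 0xE0) == 0xC0) {
            return i < 2;  // lead byte of a 2-byte sequence 110xxxxx
        }
        if ((c & 0xF0) == 0xE0) {
            return i < 3;  // lead byte of a 3-byte sequence 1110xxxx
        }
        if ((c & 0xF8) == 0xF0) {
            return i < 4;  // lead byte of a 4-byte sequence 11110xxx
        }
        return false;      // single-byte character (or invalid byte)
    }
    return false;
}

int main() {
    assert(!ends_with_incomplete_utf8("abc"));
    assert(!ends_with_incomplete_utf8("caf\xC3\xA9"));  // complete 2-byte character
    assert( ends_with_incomplete_utf8("caf\xC3"));      // split after the lead byte
    assert( ends_with_incomplete_utf8("\xE2\x82"));     // first 2 of 3 bytes
    return 0;
}
```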
slot.params.n_predict); + LOG_VERBOSE("stopped by limit", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + {"n_decoded", slot.n_decoded}, + {"n_predict", slot.params.n_predict}, + }); } - if (slot.has_new_line) { - // if we have already seen a new line, we stop after a certain time limit - if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) { - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; + if (llama_token_is_eog(model, result.tok)) { + slot.stopped_eos = true; + slot.has_next_token = false; - SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms); - } - - // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent - if (slot.params.n_indent > 0) { - // check the current indentation - // TODO: improve by not doing it more than once for each new line - if (slot.last_nl_pos > 0) { - size_t pos = slot.last_nl_pos; - - int n_indent = 0; - while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) { - n_indent++; - pos++; - } - - if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) { - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - // cut the last line - slot.generated_text.erase(pos, std::string::npos); - - SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent); - } - } - - // find the next new line - { - const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos); - - if (pos != std::string::npos) { - slot.last_nl_pos = pos + 1; - } - } - } + LOG_VERBOSE("eos token found", {}); } - // check if there is a new line in the generated text - if (result.text_to_send.find('\n') != std::string::npos) { - slot.has_new_line = true; - } - - // if context shift is disabled, we stop when it reaches the context limit - if (slot.n_past >= slot.n_ctx) { + auto n_ctx_train = llama_n_ctx_train(model); + if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 + && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { + LOG_WARNING("n_predict is not set and self-context extend is disabled." 
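Streaming responses hold back any tail of generated text that could still turn into a stop word, which is the full-versus-partial distinction in the stop handling above. A sketch of that matching; `find_stop` is a hypothetical helper, not the slot's `find_stopping_strings`:

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Earliest full occurrence of any stop word in `text`, or (when partial = true)
// the position where a prefix of a stop word hangs off the end of `text`.
static size_t find_stop(const std::string & text, const std::vector<std::string> & stops, bool partial) {
    size_t best = std::string::npos;
    for (const std::string & stop : stops) {
        if (!partial) {
            const size_t pos = text.find(stop);
            if (pos != std::string::npos && pos < best) {
                best = pos;
            }
        } else {
            // is the tail of `text` a proper prefix of `stop`?
            for (size_t len = std::min(stop.size() - 1, text.size()); len > 0; --len) {
                if (text.compare(text.size() - len, len, stop, 0, len) == 0) {
                    best = std::min(best, text.size() - len);
                    break;
                }
            }
        }
    }
    return best;
}

int main() {
    const std::vector<std::string> stops = { "</s>", "User:" };
    assert(find_stop("hello </s> world", stops, false) == 6);  // full match: cut here
    assert(find_stop("hello </", stops, true)  == 6);          // might become "</s>": hold back
    assert(find_stop("hello", stops, false) == std::string::npos);
    return 0;
}
```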
+ " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", { + { "id_slot", slot.id }, + { "params.n_predict", slot.params.n_predict }, + { "slot.n_prompt_tokens", slot.n_prompt_tokens }, + { "slot.n_decoded", slot.n_decoded }, + { "slot.n_predict", slot.n_predict }, + { "n_slots", params.n_parallel }, + { "slot.n_ctx", slot.n_ctx }, + { "n_ctx", n_ctx }, + { "n_ctx_train", n_ctx_train }, + { "ga_n", slot.ga_n }, + }); slot.truncated = true; - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n", - slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx); - } - - if (llama_vocab_is_eog(vocab, result.tok)) { - slot.stop = STOP_TYPE_EOS; - slot.has_next_token = false; - - SLT_DBG(slot, "%s", "stopped by EOS\n"); - } - - const auto n_ctx_train = llama_model_n_ctx_train(model); - - if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { - slot.truncated = true; - slot.stop = STOP_TYPE_LIMIT; + slot.stopped_limit = true; slot.has_next_token = false; // stop prediction - - SLT_WRN(slot, - "n_predict (%d) is set for infinite generation. " - "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n", - slot.params.n_predict, n_ctx_train); } - SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str()); + LOG_VERBOSE("next token", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + {"token", result.tok}, + {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, + {"n_decoded", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }); return slot.has_next_token; // continue } - void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) { - size_t n_probs = slot.params.sampling.n_probs; - size_t n_vocab = llama_vocab_n_tokens(vocab); - if (post_sampling) { - const auto * cur_p = common_sampler_get_candidates(slot.smpl); - const size_t max_probs = cur_p->size; + json get_formated_generation(const server_slot & slot) const { + const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); + const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); - // set probability for sampled token - for (size_t i = 0; i < max_probs; i++) { - if (cur_p->data[i].id == result.tok) { - result.prob = cur_p->data[i].p; - break; - } - } - - // set probability for top n_probs tokens - result.probs.reserve(max_probs); - for (size_t i = 0; i < std::min(max_probs, n_probs); i++) { - result.probs.push_back({ - cur_p->data[i].id, - common_detokenize(ctx, {cur_p->data[i].id}, special), - cur_p->data[i].p - }); - } - } else { - // TODO: optimize this with min-p optimization - std::vector cur = get_token_probabilities(ctx, idx); - - // set probability for sampled token - for (size_t i = 0; i < n_vocab; i++) { - // set probability for sampled token - if (cur[i].id == result.tok) { - result.prob = cur[i].p; - break; - } - } - - // set probability for top n_probs tokens - result.probs.reserve(n_probs); - for (size_t i = 0; i < 
std::min(n_vocab, n_probs); i++) { - result.probs.push_back({ - cur[i].id, - common_detokenize(ctx, {cur[i].id}, special), - cur[i].p - }); - } + std::vector samplers_sequence; + samplers_sequence.reserve(slot.sparams.samplers_sequence.size()); + for (const auto & sampler_type : slot.sparams.samplers_sequence) { + samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type)); } + + return json { + {"n_ctx", slot.n_ctx}, + {"n_predict", slot.n_predict}, + {"model", params.model_alias}, + {"seed", slot.sparams.seed}, + {"temperature", slot.sparams.temp}, + {"dynatemp_range", slot.sparams.dynatemp_range}, + {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, + {"top_k", slot.sparams.top_k}, + {"top_p", slot.sparams.top_p}, + {"min_p", slot.sparams.min_p}, + {"tfs_z", slot.sparams.tfs_z}, + {"typical_p", slot.sparams.typical_p}, + {"repeat_last_n", slot.sparams.penalty_last_n}, + {"repeat_penalty", slot.sparams.penalty_repeat}, + {"presence_penalty", slot.sparams.penalty_present}, + {"frequency_penalty", slot.sparams.penalty_freq}, + {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, + {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, + {"mirostat", slot.sparams.mirostat}, + {"mirostat_tau", slot.sparams.mirostat_tau}, + {"mirostat_eta", slot.sparams.mirostat_eta}, + {"penalize_nl", slot.sparams.penalize_nl}, + {"stop", slot.params.antiprompt}, + {"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict + {"n_keep", slot.params.n_keep}, + {"n_discard", slot.params.n_discard}, + {"ignore_eos", ignore_eos}, + {"stream", slot.params.stream}, + {"logit_bias", slot.sparams.logit_bias}, + {"n_probs", slot.sparams.n_probs}, + {"min_keep", slot.sparams.min_keep}, + {"grammar", slot.sparams.grammar}, + {"samplers", samplers_sequence} + }; } void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - send_error(task.id, error, type); + send_error(task.id, task.id_multi, error, type); } void send_error(const server_slot & slot, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - send_error(slot.id_task, error, type); + send_error(slot.id_task, slot.id_multi, error, type); } - void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); + void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { + LOG_ERROR("task error", { + {"id_multi", id_multi}, + {"id_task", id_task}, + {"error", error}, + }); - auto res = std::make_unique(); - res->id = id_task; - res->err_type = type; - res->err_msg = error; + server_task_result res; + res.id = id_task; + res.id_multi = id_multi; + res.stop = false; + res.error = true; + res.data = format_error_response(error, type); - queue_results.send(std::move(res)); + queue_results.send(res); } - void send_partial_response(server_slot & slot, const completion_token_output & tkn) { - auto res = std::make_unique(); + void send_partial_response(server_slot & slot, completion_token_output tkn) { + server_task_result res; + res.id = slot.id_task; + res.id_multi = slot.id_multi; + res.error = false; + res.stop = false; + res.data = json { + {"content", tkn.text_to_send}, + {"stop", false}, + {"id_slot", slot.id}, + {"multimodal", false} + }; - res->id = slot.id_task; - res->index = slot.index; - res->content = tkn.text_to_send; - res->tokens = { 
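The probability reporting above returns the n_probs most likely candidates. A standalone sketch of getting there from raw logits with a numerically stable softmax and a partial sort, using no llama.cpp types:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct token_prob {
    int   id;
    float p;
};

// Softmax over raw logits, then return the n_probs most likely tokens.
static std::vector<token_prob> top_probs(const std::vector<float> & logits, size_t n_probs) {
    std::vector<token_prob> probs(logits.size());

    const float max_l = *std::max_element(logits.begin(), logits.end());
    double sum = 0.0;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = { (int) i, std::exp(logits[i] - max_l) };  // subtract max for stability
        sum += probs[i].p;
    }
    for (auto & tp : probs) {
        tp.p = (float) (tp.p / sum);
    }

    n_probs = std::min(n_probs, probs.size());
    std::partial_sort(probs.begin(), probs.begin() + n_probs, probs.end(),
                      [](const token_prob & a, const token_prob & b) { return a.p > b.p; });
    probs.resize(n_probs);
    return probs;
}

int main() {
    const auto top = top_probs({ 1.0f, 3.0f, 2.0f, -1.0f }, 2);
    for (const auto & tp : top) {
        printf("token %d: p = %.3f\n", tp.id, tp.p);
    }
    return 0;
}
```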
tkn.tok }; + if (slot.sparams.n_probs > 0) { + const std::vector to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); + const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); + const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); - res->n_decoded = slot.n_decoded; - res->n_prompt_tokens = slot.n_prompt_tokens; - res->post_sampling_probs = slot.params.post_sampling_probs; + std::vector probs_output; + if (probs_pos < probs_stop_pos) { + probs_output = std::vector( + slot.generated_token_probs.begin() + probs_pos, + slot.generated_token_probs.begin() + probs_stop_pos); + } + slot.n_sent_token_probs = probs_stop_pos; - res->verbose = slot.params.verbose; - res->oaicompat = slot.params.oaicompat; - res->oaicompat_model = slot.params.oaicompat_model; - res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; - - // populate res.probs_output - if (slot.params.sampling.n_probs > 0) { - res->prob_output = tkn; // copy the token probs + res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); } - // populate timings if this is final response or timings_per_token is enabled - if (slot.stop != STOP_TYPE_NONE || slot.params.timings_per_token) { - res->timings = slot.get_timings(); + if (slot.oaicompat) { + res.data["oaicompat_token_ctr"] = slot.n_decoded; + res.data["model"] = slot.oaicompat_model; } - queue_results.send(std::move(res)); + queue_results.send(res); } - void send_final_response(server_slot & slot) { - auto res = std::make_unique(); - res->id = slot.id_task; - res->id_slot = slot.id; + void send_final_response(const server_slot & slot) { + server_task_result res; + res.id = slot.id_task; + res.id_multi = slot.id_multi; + res.error = false; + res.stop = true; + res.data = json { + {"content", !slot.params.stream ? 
slot.generated_text : ""}, + {"id_slot", slot.id}, + {"stop", true}, + {"model", params.model_alias}, + {"tokens_predicted", slot.n_decoded}, + {"tokens_evaluated", slot.n_prompt_tokens}, + {"generation_settings", get_formated_generation(slot)}, + {"prompt", slot.prompt}, + {"truncated", slot.truncated}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + {"tokens_cached", slot.n_past}, + {"timings", slot.get_formated_timings()} + }; - res->index = slot.index; - res->content = std::move(slot.generated_text); - res->tokens = std::move(slot.generated_tokens); - res->timings = slot.get_timings(); - res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); - res->response_fields = std::move(slot.params.response_fields); - - res->truncated = slot.truncated; - res->n_decoded = slot.n_decoded; - res->n_prompt_tokens = slot.n_prompt_tokens; - res->n_tokens_cached = slot.n_past; - res->has_new_line = slot.has_new_line; - res->stopping_word = slot.stopping_word; - res->stop = slot.stop; - res->post_sampling_probs = slot.params.post_sampling_probs; - - res->verbose = slot.params.verbose; - res->stream = slot.params.stream; - res->oaicompat = slot.params.oaicompat; - res->oaicompat_model = slot.params.oaicompat_model; - res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; - res->oaicompat_chat_format = slot.params.oaicompat_chat_format; - // populate res.probs_output - if (slot.params.sampling.n_probs > 0) { - if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) { - const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); + if (slot.sparams.n_probs > 0) { + std::vector probs; + if (!slot.params.stream && slot.stopped_word) { + const std::vector stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); - res->probs_output = std::vector( + probs = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end() - safe_offset); } else { - res->probs_output = std::vector( + probs = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end()); } + + res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs); } - res->generation_params = slot.params; // copy the parameters + if (slot.oaicompat) { + res.data["oaicompat_token_ctr"] = slot.n_decoded; + res.data["model"] = slot.oaicompat_model; + } - queue_results.send(std::move(res)); + queue_results.send(res); } void send_embedding(const server_slot & slot, const llama_batch & batch) { - auto res = std::make_unique(); - res->id = slot.id_task; - res->index = slot.index; - res->n_tokens = slot.n_prompt_tokens; - res->oaicompat = slot.params.oaicompat; + server_task_result res; + res.id = slot.id_task; + res.id_multi = slot.id_multi; + res.error = false; + res.stop = true; - const int n_embd = llama_model_n_embd(model); + const int n_embd = llama_n_embd(model); std::vector embd_res(n_embd, 0.0f); for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) { continue; } @@ -2423,187 +1491,150 @@ struct server_context { } if (embd == NULL) { - SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); + LOG_ERROR("failed to get embeddings", { + {"token", batch.token [i]}, + {"seq_id", batch.seq_id[i][0]} + }); + + res.data = 
json { + {"embedding", std::vector(n_embd, 0.0f)}, + }; - res->embedding.push_back(std::vector(n_embd, 0.0f)); continue; } - // normalize only when there is pooling - // TODO: configurable - if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) { - common_embd_normalize(embd, embd_res.data(), n_embd, 2); - res->embedding.push_back(embd_res); - } else { - res->embedding.push_back({ embd, embd + n_embd }); - } + llama_embd_normalize(embd, embd_res.data(), n_embd); + + res.data = json { + {"embedding", embd_res}, + }; } - SLT_DBG(slot, "%s", "sending embeddings\n"); - - queue_results.send(std::move(res)); + queue_results.send(res); } - void send_rerank(const server_slot & slot, const llama_batch & batch) { - auto res = std::make_unique(); - res->id = slot.id_task; - res->index = slot.index; - res->n_tokens = slot.n_prompt_tokens; + void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding) { + server_task task; + task.id = id_task; + task.id_multi = id_multi; + task.id_target = 0; + task.data = std::move(data); + task.infill = infill; + task.embedding = embedding; + task.type = SERVER_TASK_TYPE_COMPLETION; - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } - - const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - } - - if (embd == NULL) { - SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - - res->score = -1e6; - continue; - } - - res->score = embd[0]; - } - - SLT_DBG(slot, "sending rerank result, res.score = %f\n", res->score); - - queue_results.send(std::move(res)); - } - - // - // Functions to create new task(s) and receive result(s) - // - - void cancel_tasks(const std::unordered_set & id_tasks) { - std::vector cancel_tasks; - cancel_tasks.reserve(id_tasks.size()); - for (const auto & id_task : id_tasks) { - SRV_WRN("cancel task, id_task = %d\n", id_task); - - server_task task(SERVER_TASK_TYPE_CANCEL); - task.id_target = id_task; - queue_results.remove_waiting_task_id(id_task); - cancel_tasks.push_back(task); - } - // push to beginning of the queue, so it has highest priority - queue_tasks.post(cancel_tasks, true); - } - - // receive the results from task(s) - void receive_multi_results( - const std::unordered_set & id_tasks, - const std::function&)> & result_handler, - const std::function & error_handler, - const std::function & is_connection_closed) { - std::vector results(id_tasks.size()); - for (int i = 0; i < (int)id_tasks.size(); i++) { - server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS); - - if (is_connection_closed()) { - cancel_tasks(id_tasks); - return; - } - - if (result == nullptr) { - i--; // retry - continue; - } - - if (result->is_error()) { - error_handler(result->to_json()); - cancel_tasks(id_tasks); - return; - } - - GGML_ASSERT( - dynamic_cast(result.get()) != nullptr - || dynamic_cast(result.get()) != nullptr - || dynamic_cast(result.get()) != nullptr - ); - const size_t idx = result->get_index(); - GGML_ASSERT(idx < results.size() && "index out of range"); - results[idx] = std::move(result); - } - result_handler(results); - } - - // receive the results from task(s), in stream mode - void receive_cmpl_results_stream( - const std::unordered_set & id_tasks, - const std::function & result_handler, - const std::function & error_handler, - const std::function & is_connection_closed) 
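The embedding path normalizes the pooled vector before returning it, so clients can compare embeddings with a plain dot product. A sketch of the underlying L2 normalization (assuming that is what the normalize helper applies by default):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// L2-normalize an embedding in place so that a dot product between two
// normalized vectors equals their cosine similarity.
static void l2_normalize(std::vector<float> & v) {
    double sum = 0.0;
    for (float x : v) {
        sum += (double) x * x;
    }
    const double norm = std::sqrt(sum);
    if (norm > 0.0) {
        for (float & x : v) {
            x = (float) (x / norm);
        }
    }
}

int main() {
    std::vector<float> embd = { 3.0f, 4.0f };
    l2_normalize(embd);
    printf("%.2f %.2f\n", embd[0], embd[1]);  // 0.60 0.80
    return 0;
}
```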
{ - size_t n_finished = 0; - while (true) { - server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS); - - if (is_connection_closed()) { - cancel_tasks(id_tasks); - return; - } - - if (result == nullptr) { - continue; // retry - } - - if (result->is_error()) { - error_handler(result->to_json()); - cancel_tasks(id_tasks); - return; - } - - GGML_ASSERT( - dynamic_cast(result.get()) != nullptr - || dynamic_cast(result.get()) != nullptr - ); - if (!result_handler(result)) { - cancel_tasks(id_tasks); - break; - } - - if (result->is_stop()) { - if (++n_finished == id_tasks.size()) { + // when a completion task's prompt array is not a singleton, we split it into multiple requests + // otherwise, it's a single-prompt task, we actually queue it + // if there's numbers in the prompt array it will be treated as an array of tokens + if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) { + bool numbers = false; + for (const auto & e : task.data.at("prompt")) { + if (e.is_number()) { + numbers = true; break; } } + + // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers, + // it will completely stall the server. I don't know where the bug for this is. + // + // if there are numbers, it needs to be treated like a single prompt, + // queue_tasks handles a mix of strings and numbers just fine. + if (numbers) { + queue_tasks.post(task); + } else { + split_multiprompt_task(id_task, task); + } + } else { + queue_tasks.post(task); } } - // - // Functions to process the task - // + void request_cancel(int id_task) { + server_task task; + task.type = SERVER_TASK_TYPE_CANCEL; + task.id_target = id_task; - void process_single_task(server_task task) { + queue_tasks.post(task); + } + + void split_multiprompt_task(int id_multi, const server_task & multiprompt_task) { + const int prompt_count = multiprompt_task.data.at("prompt").size(); + if (prompt_count <= 1) { + send_error(multiprompt_task, "error while handling multiple prompts"); + return; + } + + // generate all the ID for subtask + std::vector subtask_ids(prompt_count); + for (int i = 0; i < prompt_count; i++) { + subtask_ids[i] = queue_tasks.get_new_id(); + } + + // queue up the multitask so we can track its subtask progression + queue_tasks.add_multitask(id_multi, subtask_ids); + + // add subtasks + for (int i = 0; i < prompt_count; i++) { + json subtask_data = multiprompt_task.data; + subtask_data["prompt"] = subtask_data.at("prompt")[i]; + + // subtasks inherit everything else (infill mode, embedding mode, etc.) + request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding); + } + } + + void process_single_task(const server_task & task) { switch (task.type) { case SERVER_TASK_TYPE_COMPLETION: - case SERVER_TASK_TYPE_INFILL: - case SERVER_TASK_TYPE_EMBEDDING: - case SERVER_TASK_TYPE_RERANK: { - const int id_slot = task.id_selected_slot; + const int id_slot = json_value(task.data, "id_slot", -1); - server_slot * slot = id_slot != -1 ? 
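request_completion above splits a "prompt" array into one subtask per element, unless the array contains numbers, in which case it is treated as a single pre-tokenized prompt. A sketch of that classification:

```cpp
#include <cstdio>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

enum class prompt_kind {
    SINGLE,    // a string, or an array of token ids -> one task
    MULTIPLE,  // an array of prompts -> one subtask per element
};

// Any number inside the array means "token ids", so the request stays a single
// task; otherwise an array with more than one element is split into subtasks.
static prompt_kind classify_prompt(const json & prompt) {
    if (!prompt.is_array() || prompt.size() <= 1) {
        return prompt_kind::SINGLE;
    }
    for (const auto & e : prompt) {
        if (e.is_number()) {
            return prompt_kind::SINGLE;
        }
    }
    return prompt_kind::MULTIPLE;
}

int main() {
    printf("%d\n", (int) classify_prompt(json("just one string")));       // SINGLE
    printf("%d\n", (int) classify_prompt(json::parse("[1, 2, 3]")));      // SINGLE (tokens)
    printf("%d\n", (int) classify_prompt(json::parse(R"(["a", "b"])")));  // MULTIPLE
    return 0;
}
```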
get_slot_by_id(id_slot) : get_available_slot(task); + server_slot * slot; + + if (id_slot != -1) { + slot = get_slot_by_id(id_slot); + } else { + std::string prompt; + if (task.data.contains("prompt") && task.data.at("prompt").is_string()) { + prompt = json_value(task.data, "prompt", std::string()); + } + + slot = get_available_slot(prompt); + } if (slot == nullptr) { // if no slot is available, we defer this task for processing later - SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id); + LOG_VERBOSE("no slot is available", {{"id_task", task.id}}); queue_tasks.defer(task); break; } - if (slot->is_processing()) { + if (!slot->available()) { // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); queue_tasks.defer(task); break; } + if (task.data.contains("system_prompt")) { + std::string sys_prompt = json_value(task.data, "system_prompt", std::string()); + system_prompt_set(sys_prompt); + + for (server_slot & slot : slots) { + slot.n_past = 0; + slot.n_past_se = 0; + } + } + + slot->reset(); + + slot->id_task = task.id; + slot->id_multi = task.id_multi; + slot->infill = task.infill; + slot->embedding = task.embedding; + if (!launch_slot_with_task(*slot, task)) { - SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); + LOG_ERROR("error while launching slot", task.data); break; } } break; @@ -2629,58 +1660,85 @@ struct server_context { int n_processing_slots = 0; for (server_slot & slot : slots) { - json slot_data = slot.to_json(); + json slot_data = get_formated_generation(slot); + slot_data["id"] = slot.id; + slot_data["id_task"] = slot.id_task; + slot_data["state"] = slot.state; + slot_data["prompt"] = slot.prompt; + slot_data["next_token"] = { + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, + {"n_decoded", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }; - if (slot.is_processing()) { - n_processing_slots++; - } else { + if (slot_data["state"] == SLOT_STATE_IDLE) { n_idle_slots++; + } else { + n_processing_slots++; } slots_data.push_back(slot_data); } - SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); + LOG_INFO("slot data", { + {"id_task", task.id}, + {"n_idle_slots", n_idle_slots}, + {"n_processing_slots", n_processing_slots} + }); - auto res = std::make_unique(); - res->id = task.id; - res->slots_data = std::move(slots_data); - res->n_idle_slots = n_idle_slots; - res->n_processing_slots = n_processing_slots; - res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); - res->t_start = metrics.t_start; + LOG_VERBOSE("slot data", { + {"id_task", task.id}, + {"n_idle_slots", n_idle_slots}, + {"n_processing_slots", n_processing_slots}, + {"slots", slots_data} + }); - res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); - res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); + server_task_result res; + res.id = task.id; + res.id_multi = task.id_multi; + res.stop = true; + res.error = false; + res.data = { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", queue_tasks.queue_tasks_deferred.size() }, + { "t_start", metrics.t_start}, - res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; - 
res->t_prompt_processing_total = metrics.t_prompt_processing_total; - res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; - res->t_tokens_generation_total = metrics.t_tokens_generation_total; + { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total}, + { "t_tokens_generation_total", metrics.t_tokens_generation_total}, + { "n_tokens_predicted_total", metrics.n_tokens_predicted_total}, + { "t_prompt_processing_total", metrics.t_prompt_processing_total}, - res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; - res->t_prompt_processing = metrics.t_prompt_processing; - res->n_tokens_predicted = metrics.n_tokens_predicted; - res->t_tokens_generation = metrics.t_tokens_generation; + { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed}, + { "t_prompt_processing", metrics.t_prompt_processing}, + { "n_tokens_predicted", metrics.n_tokens_predicted}, + { "t_tokens_generation", metrics.t_tokens_generation}, - res->n_decode_total = metrics.n_decode_total; - res->n_busy_slots_total = metrics.n_busy_slots_total; + { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, + { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, - if (task.metrics_reset_bucket) { + { "slots", slots_data }, + }; + + if (json_value(task.data, "reset_bucket", false)) { metrics.reset_bucket(); } - queue_results.send(std::move(res)); + queue_results.send(res); } break; case SERVER_TASK_TYPE_SLOT_SAVE: { - int id_slot = task.slot_action.slot_id; + int id_slot = task.data.at("id_slot"); server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } - if (slot->is_processing()) { + if (!slot->available()) { // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); queue_tasks.defer(task); break; } @@ -2688,47 +1746,52 @@ struct server_context { const size_t token_count = slot->cache_tokens.size(); const int64_t t_start = ggml_time_us(); - std::string filename = task.slot_action.filename; - std::string filepath = task.slot_action.filepath; + std::string filename = task.data.at("filename"); + std::string filepath = task.data.at("filepath"); - const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count); + const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count); const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; - auto res = std::make_unique(); - res->id = task.id; - res->id_slot = id_slot; - res->filename = filename; - res->is_save = true; - res->n_tokens = token_count; - res->n_bytes = nwrite; - res->t_ms = t_save_ms; - queue_results.send(std::move(res)); + server_task_result result; + result.id = task.id; + result.stop = true; + result.error = false; + result.data = json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_saved", token_count }, // tokens saved + { "n_written", nwrite }, // bytes written + { "timings", { + { "save_ms", t_save_ms } + } } + }; + queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_RESTORE: { - int id_slot = task.slot_action.slot_id; + int id_slot = task.data.at("id_slot"); server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", 
ERROR_TYPE_INVALID_REQUEST); break; } - if (slot->is_processing()) { + if (!slot->available()) { // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); queue_tasks.defer(task); break; } const int64_t t_start = ggml_time_us(); - std::string filename = task.slot_action.filename; - std::string filepath = task.slot_action.filepath; + std::string filename = task.data.at("filename"); + std::string filepath = task.data.at("filepath"); slot->cache_tokens.resize(slot->n_ctx); size_t token_count = 0; - size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count); + size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count); if (nread == 0) { slot->cache_tokens.resize(0); send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST); @@ -2739,67 +1802,114 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; - auto res = std::make_unique(); - res->id = task.id; - res->id_slot = id_slot; - res->filename = filename; - res->is_save = false; - res->n_tokens = token_count; - res->n_bytes = nread; - res->t_ms = t_restore_ms; - queue_results.send(std::move(res)); + server_task_result result; + result.id = task.id; + result.stop = true; + result.error = false; + result.data = json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_restored", token_count }, // tokens restored + { "n_read", nread }, // bytes read + { "timings", { + { "restore_ms", t_restore_ms } + } } + }; + queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_ERASE: { - int id_slot = task.slot_action.slot_id; + int id_slot = task.data.at("id_slot"); server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } - if (slot->is_processing()) { + if (!slot->available()) { // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); queue_tasks.defer(task); break; } // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); + llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1); slot->cache_tokens.clear(); - auto res = std::make_unique(); - res->id = task.id; - res->id_slot = id_slot; - res->n_erased = n_erased; - queue_results.send(std::move(res)); - } break; - case SERVER_TASK_TYPE_SET_LORA: - { - params_base.lora_adapters = std::move(task.set_lora); - auto res = std::make_unique(); - res->id = task.id; - queue_results.send(std::move(res)); + server_task_result result; + result.id = task.id; + result.stop = true; + result.error = false; + result.data = json { + { "id_slot", id_slot }, + { "n_erased", n_erased } + }; + queue_results.send(result); } break; } } + void on_finish_multitask(const server_task_multi & multitask) { + // all subtasks done == multitask is done + server_task_result result; + result.id = multitask.id; + result.stop = true; + result.error = false; + + // collect json results into one json result + std::vector result_jsons; + for 
(const auto & subres : multitask.results) { + result_jsons.push_back(subres.data); + result.error = result.error && subres.error; + } + result.data = json { + { "results", result_jsons } + }; + + queue_results.send(result); + } + void update_slots() { + if (system_need_update) { + system_prompt_update(); + } + + // release slots + for (auto & slot : slots) { + if (slot.command == SLOT_COMMAND_RELEASE) { + slot.state = SLOT_STATE_IDLE; + slot.command = SLOT_COMMAND_NONE; + slot.t_last_used = ggml_time_us(); + + LOG_INFO("slot released", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()}, + {"truncated", slot.truncated} + }); + + queue_tasks.notify_slot_changed(); + } + } + // check if all slots are idle { bool all_idle = true; for (auto & slot : slots) { - if (slot.is_processing()) { + if (slot.state != SLOT_STATE_IDLE || slot.command != SLOT_COMMAND_NONE) { all_idle = false; break; } } if (all_idle) { - SRV_INF("%s", "all slots are idle\n"); - if (clean_kv_cache) { + LOG_INFO("all slots are idle", {}); + if (system_prompt.empty() && clean_kv_cache) { kv_cache_clear(); } @@ -2808,75 +1918,71 @@ struct server_context { } { - SRV_DBG("%s", "posting NEXT_RESPONSE\n"); + LOG_VERBOSE("posting NEXT_RESPONSE", {}); + + server_task task; + task.type = SERVER_TASK_TYPE_NEXT_RESPONSE; + task.id_target = -1; - server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE); - task.id = queue_tasks.get_new_id(); queue_tasks.post(task); } // apply context-shift if needed // TODO: simplify and improve for (server_slot & slot : slots) { - if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) { - if (!params_base.ctx_shift) { - // this check is redundant (for good) - // we should never get here, because generation should already stopped in process_token() - slot.release(); - send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); - continue; - } + if (slot.ga_n == 1) { + if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) { + // Shift context + const int n_keep = slot.params.n_keep + add_bos_token; + const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; + const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2); - // Shift context - const int n_keep = slot.params.n_keep + add_bos_token; - const int n_left = slot.n_past - n_keep; - const int n_discard = slot.params.n_discard ? 
slot.params.n_discard : (n_left / 2); + LOG_INFO("slot context shift", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + {"n_keep", n_keep}, + {"n_left", n_left}, + {"n_discard", n_discard}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()} + }); - SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); + llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + if (slot.params.cache_prompt) { + for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + } - if (slot.params.cache_prompt) { - for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); } - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + slot.n_past -= n_discard; + + slot.truncated = true; } - - slot.n_past -= n_discard; - - slot.truncated = true; } } // start populating the batch for this iteration - common_batch_clear(batch); - - // track if given slot can be batched with slots already in the batch - server_slot * slot_batched = nullptr; - - auto accept_special_token = [&](server_slot & slot, llama_token token) { - return params_base.special || slot.params.sampling.preserved_tokens.find(token) != slot.params.sampling.preserved_tokens.end(); - }; + llama_batch_clear(batch); // frist, add sampled tokens from any ongoing sequences for (auto & slot : slots) { - if (slot.state != SLOT_STATE_GENERATING) { - continue; - } - - // check if we can batch this slot with the previous one - if (!slot_batched) { - slot_batched = &slot; - } else if (!slot_batched->can_batch_with(slot)) { + if (slot.state == SLOT_STATE_IDLE) { continue; } slot.i_batch = batch.n_tokens; - common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); + const int32_t slot_npast = slot.n_past_se > 0 ? 
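The context shift above keeps the first n_keep tokens, discards half of the remainder by default, and compacts the slot's token cache; the KV cache itself is moved separately with llama_kv_cache_seq_rm/seq_add. A sketch of just the vector bookkeeping:

```cpp
#include <cstdio>
#include <vector>

// Discard `n_discard` tokens right after the first `n_keep` tokens, compacting
// the rest to the left - the same bookkeeping applied to the slot's token cache.
static void context_shift(std::vector<int> & cache_tokens, int n_keep, int n_discard) {
    for (size_t i = (size_t) (n_keep + n_discard); i < cache_tokens.size(); ++i) {
        cache_tokens[i - n_discard] = cache_tokens[i];
    }
    cache_tokens.resize(cache_tokens.size() - n_discard);
}

int main() {
    std::vector<int> cache = { 0, 1, 2, 3, 4, 5, 6, 7 };

    const int n_keep    = 2;                            // e.g. BOS + preamble
    const int n_left    = (int) cache.size() - n_keep;  // 6
    const int n_discard = n_left / 2;                   // default: drop half -> 3

    context_shift(cache, n_keep, n_discard);
    for (int t : cache) {
        printf("%d ", t);                               // 0 1 5 6 7
    }
    printf("\n");
    return 0;
}
```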
slot.n_past_se : slot.n_past; + + // TODO: we always have to take into account the "system_tokens" + // this is not great and needs to be improved somehow + llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true); slot.n_past += 1; @@ -2884,100 +1990,129 @@ struct server_context { slot.cache_tokens.push_back(slot.sampled); } - SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n", - slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated); + LOG_VERBOSE("slot decode token", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()}, + {"truncated", slot.truncated} + }); } // process in chunks of params.n_batch int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); - // next, batch any pending prompts without exceeding n_batch - if (params_base.cont_batching || batch.n_tokens == 0) { - for (auto & slot : slots) { - // check if we can batch this slot with the previous one - if (slot.is_processing()) { - if (!slot_batched) { - slot_batched = &slot; - } else if (!slot_batched->can_batch_with(slot)) { - continue; - } - } + // track if this is an embedding or non-embedding batch + // if we've added sampled tokens above, we are in non-embedding mode + // -1: none, 0: non-embedding, 1: embedding + int32_t batch_type = batch.n_tokens > 0 ? 0 : -1; + // next, batch any pending prompts without exceeding n_batch + if (params.cont_batching || batch.n_tokens == 0) { + for (auto & slot : slots) { // this slot still has a prompt to be processed - if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { + if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) { auto & prompt_tokens = slot.prompt_tokens; - // TODO: maybe move branch to outside of this loop in the future - if (slot.state == SLOT_STATE_STARTED) { + // we haven't tokenized the prompt yet - do it now: + if (prompt_tokens.empty()) { + LOG_VERBOSE("tokenizing prompt", { + {"id_slot", slot.id}, + {"id_task", slot.id_task} + }); + slot.t_start_process_prompt = ggml_time_us(); slot.t_start_generation = 0; + if (slot.infill) { + const bool add_bos = llama_should_add_bos_token(model); + bool suff_rm_leading_spc = true; + if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } + + auto prefix_tokens = tokenize(slot.params.input_prefix, false); + auto suffix_tokens = tokenize(slot.params.input_suffix, false); + + const int space_token = 29871; // TODO: this should not be hardcoded + if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { + suffix_tokens.erase(suffix_tokens.begin()); + } + + prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); + suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); + + auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; + auto embd_end = params.spm_infill ? 
prefix_tokens : suffix_tokens; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + + const llama_token middle_token = llama_token_middle(model); + if (middle_token >= 0) { + embd_inp.push_back(middle_token); + } + + prompt_tokens = embd_inp; + } else { + prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt + } + slot.n_past = 0; slot.n_prompt_tokens = prompt_tokens.size(); - slot.state = SLOT_STATE_PROCESSING_PROMPT; - SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); - - // print prompt tokens (for debugging) - if (1) { - // first 16 tokens (avoid flooding logs) - for (int i = 0; i < std::min(16, prompt_tokens.size()); i++) { - SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); - } - } else { - // all - for (int i = 0; i < (int) prompt_tokens.size(); i++) { - SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); - } - } + LOG_VERBOSE("prompt tokenized", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_prompt_tokens", slot.n_prompt_tokens}, + {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, + }); // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { - SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); + LOG_INFO("empty prompt - releasing slot", { + {"id_slot", slot.id}, + {"id_task", slot.id_task} + }); + slot.state = SLOT_STATE_PROCESSING; + slot.command = SLOT_COMMAND_NONE; slot.release(); slot.print_timings(); send_final_response(slot); continue; } - if (slot.is_non_causal()) { + if (slot.embedding) { + // this prompt is too large to process - discard it if (slot.n_prompt_tokens > n_ubatch) { + slot.state = SLOT_STATE_PROCESSING; + slot.command = SLOT_COMMAND_NONE; slot.release(); send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER); continue; } - - if (slot.n_prompt_tokens > slot.n_ctx) { - slot.release(); - send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_SERVER); - continue; - } } else { - if (!params_base.ctx_shift) { - // if context shift is disabled, we make sure prompt size is smaller than KV size - // TODO: there should be a separate parameter that control prompt truncation - // context shift should be applied only during the generation phase - if (slot.n_prompt_tokens >= slot.n_ctx) { - slot.release(); - send_error(slot, "the request exceeds the available context size. 
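The infill branch assembles the prompt from prefix and suffix blocks wrapped in the model's fill-in-the-middle special tokens, optionally in suffix-prefix-middle order. A sketch of that layout; the token ids below are placeholders, not real vocabulary ids:

```cpp
#include <cstdio>
#include <vector>

using token = int;

// Placeholder special-token ids - real values come from the model's vocab.
static const token TOK_BOS    = 1;
static const token TOK_PREFIX = 32007;
static const token TOK_SUFFIX = 32008;
static const token TOK_MIDDLE = 32009;

// With spm_infill = false the layout is [BOS][PRE] prefix [SUF] suffix [MID];
// with spm_infill = true the prefix and suffix blocks swap places.
static std::vector<token> build_infill_prompt(const std::vector<token> & prefix_tokens,
                                              const std::vector<token> & suffix_tokens,
                                              bool spm_infill) {
    std::vector<token> pre = { TOK_PREFIX };
    pre.insert(pre.end(), prefix_tokens.begin(), prefix_tokens.end());

    std::vector<token> suf = { TOK_SUFFIX };
    suf.insert(suf.end(), suffix_tokens.begin(), suffix_tokens.end());

    std::vector<token> out = { TOK_BOS };
    const auto & first  = spm_infill ? suf : pre;
    const auto & second = spm_infill ? pre : suf;
    out.insert(out.end(), first.begin(),  first.end());
    out.insert(out.end(), second.begin(), second.end());
    out.push_back(TOK_MIDDLE);
    return out;
}

int main() {
    const auto prompt = build_infill_prompt({ 100, 101 }, { 200 }, /*spm_infill=*/false);
    for (token t : prompt) {
        printf("%d ", t);  // 1 32007 100 101 32008 200 32009
    }
    printf("\n");
    return 0;
}
```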
try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST); - continue; - } - } if (slot.params.n_keep < 0) { slot.params.n_keep = slot.n_prompt_tokens; } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - // if input prompt is too big, truncate it - if (slot.n_prompt_tokens >= slot.n_ctx) { + // if input prompt is too big, truncate it (if group attention self-extend is disabled) + if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; - llama_tokens new_tokens( + std::vector new_tokens( prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep); @@ -2991,130 +2126,151 @@ struct server_context { slot.truncated = true; slot.n_prompt_tokens = prompt_tokens.size(); - SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens); + LOG_VERBOSE("input truncated", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"n_prompt_tokens", slot.n_prompt_tokens}, + {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, + }); GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } - if (slot.params.cache_prompt) { + llama_sampling_reset(slot.ctx_sampling); + + if (!slot.params.cache_prompt) { + slot.n_past_se = 0; + slot.ga_i = 0; + } else { + GGML_ASSERT(slot.ga_n == 1); + // reuse any previously computed tokens that are common with the new prompt - slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens); + slot.n_past = common_part(slot.cache_tokens, prompt_tokens); - // reuse chunks from the cached prompt by shifting their KV cache in the new position - if (params_base.n_cache_reuse > 0) { - size_t head_c = slot.n_past; // cache - size_t head_p = slot.n_past; // current prompt - - SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past); - - while (head_c < slot.cache_tokens.size() && - head_p < prompt_tokens.size()) { - - size_t n_match = 0; - while (head_c + n_match < slot.cache_tokens.size() && - head_p + n_match < prompt_tokens.size() && - slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) { - - n_match++; - } - - if (n_match >= (size_t) params_base.n_cache_reuse) { - SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); - //for (size_t i = head_p; i < head_p + n_match; i++) { - // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); - //} - - const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - - llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift); - - for (size_t i = 0; i < n_match; i++) { - slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; - slot.n_past++; - } - - head_c += n_match; - head_p += n_match; - } else { - head_c += 1; - } - } - - SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past); + // push the prompt into the sampling context (do not apply grammar) + for (int i = 0; i < slot.n_past; ++i) { + llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false); } } } if 
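When a prompt does not fit, the code above keeps the first n_keep tokens and drops whole half-context blocks from the middle so that the remaining tail still fits. Roughly the same computation as a standalone function:

```cpp
#include <cstdio>
#include <vector>

// Keep the first n_keep tokens, then drop whole half-context blocks from the
// middle until the remaining tail fits into the context window.
static std::vector<int> truncate_prompt(const std::vector<int> & prompt, int n_ctx, int n_keep) {
    const int n_prompt = (int) prompt.size();
    if (n_prompt < n_ctx) {
        return prompt;  // nothing to do
    }

    const int n_left        = n_ctx - n_keep;
    const int n_block_size  = n_left / 2;
    const int erased_blocks = (n_prompt - n_keep - n_block_size) / n_block_size;

    std::vector<int> out(prompt.begin(), prompt.begin() + n_keep);
    out.insert(out.end(), prompt.begin() + n_keep + erased_blocks * n_block_size, prompt.end());
    return out;
}

int main() {
    std::vector<int> prompt(100);
    for (int i = 0; i < 100; ++i) {
        prompt[i] = i;
    }

    const auto truncated = truncate_prompt(prompt, /*n_ctx=*/64, /*n_keep=*/8);
    printf("kept %zu tokens: %d %d ... %d\n",
           truncated.size(), truncated[0], truncated[8], truncated.back());
    return 0;
}
```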
(slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. - SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens); + LOG_INFO("we have to evaluate at least 1 token to generate logits", { + { "id_slot", slot.id }, + { "id_task", slot.id_task } + }); slot.n_past--; + if (slot.ga_i > 0) { + slot.n_past_se--; + } } slot.n_prompt_tokens_processed = 0; } - // non-causal tasks require to fit the entire prompt in the physical batch - if (slot.is_non_causal()) { + if (slot.embedding) { // cannot fit the prompt in the current batch - will try next iter if (batch.n_tokens + slot.n_prompt_tokens > n_batch) { continue; } } - // keep only the common part - if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { - // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); - - // there is no common part left - slot.n_past = 0; + // check that we are in the right batch_type, if not defer the slot + bool slot_type = slot.embedding ? 1 : 0; + if (batch_type == -1) { + batch_type = slot_type; + } else if (batch_type != slot_type) { + continue; } - SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past); + // keep only the common part + int p0 = (int) system_tokens.size() + slot.n_past; + if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) { + // could not partially delete (likely using a non-Transformer model) + llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); + + p0 = (int) system_tokens.size(); + if (p0 != 0) { + // copy over the system prompt when there is one + llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1); + } + + // there is no common part left (except for the system prompt) + slot.n_past = 0; + slot.n_past_se = 0; + slot.ga_i = 0; + // TODO: is the system prompt ever in the sampling context? + llama_sampling_reset(slot.ctx_sampling); + } // remove the non-common part from the cache slot.cache_tokens.resize(slot.n_past); - // add prompt tokens for processing in the current batch - while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { - // without pooling, we want to output the embeddings for all the tokens in the batch - const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; + LOG_INFO("kv cache rm [p0, end)", { + { "id_slot", slot.id }, + { "id_task", slot.id_task }, + { "p0", p0 } + }); - common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd); + int32_t slot_npast = slot.n_past_se > 0 ? 
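Prompt-cache reuse above boils down to counting how many leading tokens of the new prompt match what is already in the slot's cache; only the tail after that point needs to be re-evaluated. A sketch:

```cpp
#include <cstdio>
#include <vector>

// Number of leading tokens shared by the cached prompt and the new prompt -
// those positions stay in the KV cache and are skipped during prompt eval.
static size_t common_prefix(const std::vector<int> & a, const std::vector<int> & b) {
    size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        ++i;
    }
    return i;
}

int main() {
    const std::vector<int> cached = { 1, 15043, 29892, 3186 };
    const std::vector<int> prompt = { 1, 15043, 29892, 920, 526 };
    printf("n_past = %zu\n", common_prefix(cached, prompt));  // 3
    return 0;
}
```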
slot.n_past_se : slot.n_past; + + int32_t ga_i = slot.ga_i; + int32_t ga_n = slot.ga_n; + int32_t ga_w = slot.ga_w; + + // add prompt tokens for processing in the current batch + // TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow + for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) { + if (slot.ga_n != 1) { + while (slot_npast >= ga_i + ga_w) { + const int bd = (ga_w/ga_n)*(ga_n - 1); + slot_npast -= bd; + ga_i += ga_w/ga_n; + } + } + + llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); } slot.n_prompt_tokens_processed++; - slot.n_past++; + slot_npast++; } - SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); + LOG_VERBOSE("prompt processing progress", { + {"id_slot", slot.id}, + {"n_past", slot.n_past}, + {"n_ctx", n_ctx}, + {"n_tokens", batch.n_tokens}, + {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens}, + }); - // entire prompt has been processed + // entire prompt has been processed - start decoding new tokens if (slot.n_past == slot.n_prompt_tokens) { - slot.state = SLOT_STATE_DONE_PROMPT; + slot.state = SLOT_STATE_PROCESSING; + slot.command = SLOT_COMMAND_NONE; GGML_ASSERT(batch.n_tokens > 0); - common_sampler_reset(slot.smpl); - - // Process all prompt tokens through sampler system - for (int i = 0; i < slot.n_prompt_tokens; ++i) { - common_sampler_accept(slot.smpl, prompt_tokens[i], false); - } - // extract the logits only for the last token batch.logits[batch.n_tokens - 1] = true; slot.n_decoded = 0; slot.i_batch = batch.n_tokens - 1; - SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens); + LOG_VERBOSE("prompt done", { + {"id_slot", slot.id}, + {"n_past", slot.n_past}, + {"n_ctx", n_ctx}, + {"n_tokens", batch.n_tokens}, + }); } } @@ -3125,23 +2281,50 @@ struct server_context { } if (batch.n_tokens == 0) { - SRV_WRN("%s", "no tokens to decode\n"); + LOG_VERBOSE("no tokens to decode", {}); return; } - SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); + LOG_VERBOSE("decoding batch", { + {"n_tokens", batch.n_tokens}, + }); - if (slot_batched) { - // make sure we're in the right embedding mode - llama_set_embeddings(ctx, slot_batched->is_non_causal()); - // apply lora, only need to do it once per batch - common_set_adapter_lora(ctx, slot_batched->lora); - } + // make sure we're in the right embedding mode + llama_set_embeddings(ctx, batch_type == 1); // process the created batch of tokens for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); + for (auto & slot : slots) { + if (slot.ga_n != 1) { + // context extension via Self-Extend + // TODO: simplify and/or abstract this + while (slot.n_past_se >= slot.ga_i + slot.ga_w) { + const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; + const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); + const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; + + LOG_TEE("\n"); + LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, 
(slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + + llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); + llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); + + slot.n_past_se -= bd; + + slot.ga_i += slot.ga_w / slot.ga_n; + + LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); + } + + slot.n_past_se += n_tokens; + } + } + llama_batch batch_view = { n_tokens, batch.token + i, @@ -3150,16 +2333,22 @@ struct server_context { batch.n_seq_id + i, batch.seq_id + i, batch.logits + i, + 0, 0, 0, // unused }; const int ret = llama_decode(ctx, batch_view); - metrics.on_decoded(slots); if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); + LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", { + {"i", i}, + {"n_batch", ret}, + {"ret", ret}, + }); for (auto & slot : slots) { + slot.state = SLOT_STATE_PROCESSING; + slot.command = SLOT_COMMAND_NONE; slot.release(); send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); } @@ -3170,179 +2359,92 @@ struct server_context { n_batch /= 2; i -= n_batch; - SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); + LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", { + {"i", i}, + {"n_batch", n_batch}, + {"ret", ret}, + }); continue; // continue loop of n_batch } for (auto & slot : slots) { - if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { + if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { continue; // continue loop of slots } - if (slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.task_type == SERVER_TASK_TYPE_EMBEDDING) { - // prompt evaluated for embedding - send_embedding(slot, batch_view); - slot.release(); - slot.i_batch = -1; - continue; // continue loop of slots - } - - if (slot.task_type == SERVER_TASK_TYPE_RERANK) { - send_rerank(slot, batch_view); - slot.release(); - slot.i_batch = -1; - continue; // continue loop of slots - } - - // prompt evaluated for next-token prediction - slot.state = SLOT_STATE_GENERATING; - } else if (slot.state != SLOT_STATE_GENERATING) { + // prompt evaluated for embedding + if (slot.embedding) { + send_embedding(slot, batch_view); + slot.release(); + slot.i_batch = -1; continue; // continue loop of slots } - const int tok_idx = slot.i_batch - i; + completion_token_output result; + const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i); - llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); - - slot.i_batch = -1; - - common_sampler_accept(slot.smpl, id, true); + llama_sampling_accept(slot.ctx_sampling, 
ctx, id, true); slot.n_decoded += 1; - - const int64_t t_current = ggml_time_us(); - if (slot.n_decoded == 1) { - slot.t_start_generation = t_current; + slot.t_start_generation = ggml_time_us(); slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; metrics.on_prompt_eval(slot); } - slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3; + llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; + result.tok = id; - completion_token_output result; - result.tok = id; - result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); - result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs + const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs); + if (n_probs > 0) { + const size_t n_valid = slot.ctx_sampling->n_valid; - if (slot.params.sampling.n_probs > 0) { - populate_token_probs(slot, result, slot.params.post_sampling_probs, params_base.special, tok_idx); + // Make sure at least n_probs top tokens are at the front of the vector: + if (slot.sparams.temp == 0.0f && n_probs > n_valid) { + llama_sample_top_k(ctx, &cur_p, n_probs, 0); + } + + if (slot.sparams.temp == 0.0f) { + // With greedy sampling the probabilities have possibly not been calculated. + for (size_t i = 0; i < n_probs; ++i) { + result.probs.push_back({ + cur_p.data[i].id, + i == 0 ? 1.0f : 0.0f + }); + } + } else { + for (size_t i = 0; i < n_probs; ++i) { + result.probs.push_back({ + cur_p.data[i].id, + i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability. + }); + } + } } if (!process_token(result, slot)) { - // release slot because of stop condition slot.release(); slot.print_timings(); send_final_response(slot); metrics.on_prediction(slot); - continue; - } - } - - // do speculative decoding - for (auto & slot : slots) { - if (!slot.is_processing() || !slot.can_speculate()) { - continue; } - if (slot.state != SLOT_STATE_GENERATING) { - continue; - } - - // determine the max draft that fits the current slot state - int n_draft_max = slot.params.speculative.n_max; - - // note: n_past is not yet increased for the `id` token sampled above - // also, need to leave space for 1 extra token to allow context shifts - n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.n_past - 2); - - if (slot.n_remaining > 0) { - n_draft_max = std::min(n_draft_max, slot.n_remaining - 1); - } - - SLT_DBG(slot, "max possible draft: %d\n", n_draft_max); - - if (n_draft_max < slot.params.speculative.n_min) { - SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.params.speculative.n_min); - - continue; - } - - llama_token id = slot.sampled; - - struct common_speculative_params params_spec; - params_spec.n_draft = n_draft_max; - params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max; - params_spec.p_min = slot.params.speculative.p_min; - - llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id); - - // ignore small drafts - if (slot.params.speculative.n_min > (int) draft.size()) { - SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min); - - continue; - } - - // construct the speculation batch - common_batch_clear(slot.batch_spec); - common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true); - - for (size_t i = 0; i < draft.size(); ++i) { - 
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true); - } - - SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens); - - llama_decode(ctx, slot.batch_spec); - - // the accepted tokens from the speculation - const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft); - - slot.n_past += ids.size(); - slot.n_decoded += ids.size(); - - slot.cache_tokens.push_back(id); - slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - - llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); - - for (size_t i = 0; i < ids.size(); ++i) { - completion_token_output result; - - result.tok = ids[i]; - result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); - result.prob = 1.0f; // set later - - // TODO: set result.probs - - if (!process_token(result, slot)) { - // release slot because of stop condition - slot.release(); - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - break; - } - } - - SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past); + slot.i_batch = -1; } } - SRV_DBG("%s", "run slots completed\n"); + LOG_VERBOSE("run slots completed", {}); } json model_meta() const { return json { - {"vocab_type", llama_vocab_type (vocab)}, - {"n_vocab", llama_vocab_n_tokens (vocab)}, - {"n_ctx_train", llama_model_n_ctx_train(model)}, - {"n_embd", llama_model_n_embd (model)}, - {"n_params", llama_model_n_params (model)}, - {"size", llama_model_size (model)}, + {"vocab_type", llama_vocab_type (model)}, + {"n_vocab", llama_n_vocab (model)}, + {"n_ctx_train", llama_n_ctx_train (model)}, + {"n_embd", llama_n_embd (model)}, + {"n_params", llama_model_n_params(model)}, + {"size", llama_model_size (model)}, }; } }; @@ -3353,12 +2455,19 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp return; } - // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch + LOG_INFO("request", { + {"remote_addr", req.remote_addr}, + {"remote_port", req.remote_port}, + {"status", res.status}, + {"method", req.method}, + {"path", req.path}, + {"params", req.params}, + }); - SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); - - SRV_DBG("request: %s\n", req.body.c_str()); - SRV_DBG("response: %s\n", res.body.c_str()); + LOG_VERBOSE("request", { + {"request", req.body}, + {"response", res.body}, + }); } std::function shutdown_handler; @@ -3376,78 +2485,96 @@ inline void signal_handler(int signal) { } int main(int argc, char ** argv) { +#if SERVER_VERBOSE != 1 + log_disable(); +#endif // own arguments required by this example - common_params params; + gpt_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - common_init(); + // TODO: not great to use extern vars + server_log_json = params.log_json; + server_verbose = params.verbosity > 0; // struct that contains llama context and inference server_context ctx_server; + if (!params.system_prompt.empty()) { + ctx_server.system_prompt_set(params.system_prompt); + } + + if (params.model_alias == "unknown") { + params.model_alias = params.model; + } + llama_backend_init(); llama_numa_init(params.numa); - LOG_INF("system info: n_threads = %d, 
n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); + LOG_INFO("build info", { + {"build", LLAMA_BUILD_NUMBER}, + {"commit", LLAMA_COMMIT} + }); + + LOG_INFO("system info", { + {"n_threads", params.n_threads}, + {"n_threads_batch", params.n_threads_batch}, + {"total_threads", std::thread::hardware_concurrency()}, + {"system_info", llama_print_system_info()}, + }); std::unique_ptr svr; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); + LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}}); svr.reset( new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) ); } else { - LOG_INF("Running without SSL\n"); + LOG_INFO("Running without SSL", {}); svr.reset(new httplib::Server()); } #else - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_ERR("Server is built without SSL support\n"); - return 1; - } svr.reset(new httplib::Server()); #endif std::atomic state{SERVER_STATE_LOADING_MODEL}; svr->set_default_headers({{"Server", "llama.cpp"}}); + + // CORS preflight + svr->Options(R"(.*)", [](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + res.set_header("Access-Control-Allow-Credentials", "true"); + res.set_header("Access-Control-Allow-Methods", "POST"); + res.set_header("Access-Control-Allow-Headers", "*"); + return res.set_content("", "application/json; charset=utf-8"); + }); + svr->set_logger(log_server_request); - auto res_error = [](httplib::Response & res, const json & error_data) { + auto res_error = [](httplib::Response & res, json error_data) { json final_response {{"error", error_data}}; - res.set_content(safe_json_to_str(final_response), MIMETYPE_JSON); + res.set_content(final_response.dump(), "application/json; charset=utf-8"); res.status = json_value(error_data, "code", 500); }; - auto res_ok = [](httplib::Response & res, const json & data) { - res.set_content(safe_json_to_str(data), MIMETYPE_JSON); - res.status = 200; - }; - - svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { + svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { std::string message; try { - std::rethrow_exception(ep); - } catch (const std::exception & e) { + std::rethrow_exception(std::move(ep)); + } catch (std::exception & e) { message = e.what(); } catch (...) 
{ message = "Unknown Exception"; } - try { - json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); - res_error(res, formatted_error); - } catch (const std::exception & e) { - LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str()); - } + json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); + LOG_VERBOSE("Got exception", formatted_error); + res_error(res, formatted_error); }); svr->set_error_handler([&res_error](const httplib::Request &, httplib::Response & res) { @@ -3461,6 +2588,11 @@ int main(int argc, char ** argv) { svr->set_read_timeout (params.timeout_read); svr->set_write_timeout(params.timeout_write); + if (!svr->bind_to_port(params.hostname, params.port)) { + fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port); + return 1; + } + std::unordered_map log_data; log_data["hostname"] = params.hostname; @@ -3476,15 +2608,54 @@ int main(int argc, char ** argv) { // Necessary similarity of prompt for slot selection ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; + // load the model + if (!ctx_server.load_model(params)) { + state.store(SERVER_STATE_ERROR); + return 1; + } else { + ctx_server.init(); + state.store(SERVER_STATE_READY); + } + + LOG_INFO("model loaded", {}); + + const auto model_meta = ctx_server.model_meta(); + + // if a custom chat template is not supplied, we will use the one that comes with the model (if any) + if (params.chat_template.empty()) { + if (!ctx_server.validate_model_chat_template()) { + LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); + params.chat_template = "chatml"; + } + } + + // print sample chat example to make it clear which template is used + { + LOG_INFO("chat template", { + {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, + {"built_in", params.chat_template.empty()}, + }); + } + // // Middlewares // auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { - static const std::unordered_set public_endpoints = { - "/health", - "/models", - "/v1/models", + // TODO: should we apply API key to all endpoints, including "/health" and "/models"? 
+ static const std::set protected_endpoints = { + "/props", + "/completion", + "/completions", + "/v1/completions", + "/chat/completions", + "/v1/chat/completions", + "/infill", + "/tokenize", + "/detokenize", + "/embedding", + "/embeddings", + "/v1/embeddings", }; // If API key is not set, skip validation @@ -3492,8 +2663,8 @@ int main(int argc, char ** argv) { return true; } - // If path is public or is static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { + // If path is not in protected_endpoints list, skip validation + if (protected_endpoints.find(req.path) == protected_endpoints.end()) { return true; } @@ -3509,42 +2680,17 @@ int main(int argc, char ** argv) { } // API key is invalid or not provided + // TODO: make another middleware for CORS related logic + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - LOG_WRN("Unauthorized: Invalid API Key\n"); + LOG_WARNING("Unauthorized: Invalid API Key", {}); return false; }; - auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) { - server_state current_state = state.load(); - if (current_state == SERVER_STATE_LOADING_MODEL) { - auto tmp = string_split(req.path, '.'); - if (req.path == "/" || tmp.back() == "html") { - res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); - res.status = 503; - } else { - res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); - } - return false; - } - return true; - }; - // register server middlewares - svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - // If this is OPTIONS request, skip validation because browsers don't include Authorization header - if (req.method == "OPTIONS") { - res.set_header("Access-Control-Allow-Credentials", "true"); - res.set_header("Access-Control-Allow-Methods", "GET, POST"); - res.set_header("Access-Control-Allow-Headers", "*"); - res.set_content("", "text/html"); // blank response, no data - return httplib::Server::HandlerResponse::Handled; // skip further processing - } - if (!middleware_server_state(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } + svr->set_pre_routing_handler([&middleware_validate_api_key](const httplib::Request & req, httplib::Response & res) { if (!middleware_validate_api_key(req, res)) { return httplib::Server::HandlerResponse::Handled; } @@ -3555,126 +2701,157 @@ int main(int argc, char ** argv) { // Route handlers (or controllers) // - const auto handle_health = [&](const httplib::Request &, httplib::Response & res) { - // error and loading states are handled by middleware - json health = {{"status", "ok"}}; - res_ok(res, health); + const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) { + server_state current_state = state.load(); + switch (current_state) { + case SERVER_STATE_READY: + { + // request slots data using task queue + server_task task; + task.id = ctx_server.queue_tasks.get_new_id(); + task.type = SERVER_TASK_TYPE_METRICS; + task.id_target = -1; + + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); + + // get the result + server_task_result result = 
ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); + + const int n_idle_slots = result.data.at("idle"); + const int n_processing_slots = result.data.at("processing"); + + json health = { + {"status", "ok"}, + {"slots_idle", n_idle_slots}, + {"slots_processing", n_processing_slots} + }; + + res.status = 200; // HTTP OK + if (params.endpoint_slots && req.has_param("include_slots")) { + health["slots"] = result.data.at("slots"); + } + + if (n_idle_slots == 0) { + health["status"] = "no slot available"; + if (req.has_param("fail_on_no_slot")) { + res.status = 503; // HTTP Service Unavailable + } + } + + res.set_content(health.dump(), "application/json"); + break; + } + case SERVER_STATE_LOADING_MODEL: + { + res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); + } break; + case SERVER_STATE_ERROR: + { + res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER)); + } break; + } }; - const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) { + const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) { if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); + res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED)); return; } // request slots data using task queue - server_task task(SERVER_TASK_TYPE_METRICS); + server_task task; task.id = ctx_server.queue_tasks.get_new_id(); + task.id_multi = -1; + task.id_target = -1; + task.type = SERVER_TASK_TYPE_METRICS; + ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task, true); // high-priority task + ctx_server.queue_tasks.post(task); // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task.id); + server_task_result result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - // TODO: get rid of this dynamic_cast - auto res_metrics = dynamic_cast(result.get()); - GGML_ASSERT(res_metrics != nullptr); - - // optionally return "fail_on_no_slot" error - if (req.has_param("fail_on_no_slot")) { - if (res_metrics->n_idle_slots == 0) { - res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); - return; - } - } - - res_ok(res, res_metrics->slots_data); + res.set_content(result.data.at("slots").dump(), "application/json"); + res.status = 200; // HTTP OK }; const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { if (!params.endpoint_metrics) { - res_error(res, format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); + res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED)); return; } // request slots data using task queue - server_task task(SERVER_TASK_TYPE_METRICS); + server_task task; task.id = ctx_server.queue_tasks.get_new_id(); - task.metrics_reset_bucket = true; + task.id_multi = -1; + task.id_target = -1; + task.type = SERVER_TASK_TYPE_METRICS; + task.data.push_back({{"reset_bucket", true}}); ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task, true); // high-priority task + ctx_server.queue_tasks.post(task); // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task.id); + server_task_result result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } + json data = result.data; - // TODO: get rid of this dynamic_cast - auto res_metrics = dynamic_cast(result.get()); - GGML_ASSERT(res_metrics != nullptr); + const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed"); + const uint64_t t_prompt_processing = data.at("t_prompt_processing"); + + const uint64_t n_tokens_predicted = data.at("n_tokens_predicted"); + const uint64_t t_tokens_generation = data.at("t_tokens_generation"); + + const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells"); // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names json all_metrics_def = json { {"counter", {{ {"name", "prompt_tokens_total"}, {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) res_metrics->n_prompt_tokens_processed_total} + {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")} }, { {"name", "prompt_seconds_total"}, {"help", "Prompt process time"}, - {"value", (uint64_t) res_metrics->t_prompt_processing_total / 1.e3} + {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3} }, { {"name", "tokens_predicted_total"}, {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) res_metrics->n_tokens_predicted_total} + {"value", (uint64_t) data.at("n_tokens_predicted_total")} }, { {"name", "tokens_predicted_seconds_total"}, {"help", "Predict process time"}, - {"value", (uint64_t) res_metrics->t_tokens_generation_total / 1.e3} - }, { - {"name", "n_decode_total"}, - {"help", "Total number of llama_decode() calls"}, - {"value", res_metrics->n_decode_total} - }, { - {"name", "n_busy_slots_per_decode"}, - {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total} + {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3} }}}, {"gauge", {{ {"name", "prompt_tokens_seconds"}, {"help", "Average prompt throughput in tokens/s."}, - {"value", res_metrics->n_prompt_tokens_processed ? 1.e3 / res_metrics->t_prompt_processing * res_metrics->n_prompt_tokens_processed : 0.} + {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.} },{ {"name", "predicted_tokens_seconds"}, {"help", "Average generation throughput in tokens/s."}, - {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.} + {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.} },{ {"name", "kv_cache_usage_ratio"}, {"help", "KV-cache usage. 
1 means 100 percent usage."}, - {"value", 1. * res_metrics->kv_cache_used_cells / params.n_ctx} + {"value", 1. * kv_cache_used_cells / params.n_ctx} },{ {"name", "kv_cache_tokens"}, {"help", "KV-cache tokens."}, - {"value", (uint64_t) res_metrics->kv_cache_tokens_count} + {"value", (uint64_t) data.at("kv_cache_tokens_count")} },{ {"name", "requests_processing"}, - {"help", "Number of requests processing."}, - {"value", (uint64_t) res_metrics->n_processing_slots} + {"help", "Number of request processing."}, + {"value", (uint64_t) data.at("processing")} },{ {"name", "requests_deferred"}, - {"help", "Number of requests deferred."}, - {"value", (uint64_t) res_metrics->n_tasks_deferred} + {"help", "Number of request deferred."}, + {"value", (uint64_t) data.at("deferred")} }}} }; @@ -3695,13 +2872,14 @@ int main(int argc, char ** argv) { } } - res.set_header("Process-Start-Time-Unix", std::to_string(res_metrics->t_start)); + const int64_t t_start = data.at("t_start"); + res.set_header("Process-Start-Time-Unix", std::to_string(t_start)); res.set_content(prometheus.str(), "text/plain; version=0.0.4"); res.status = 200; // HTTP OK }; - const auto handle_slots_save = [&ctx_server, &res_error, &res_ok, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { + const auto handle_slots_save = [&ctx_server, &res_error, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { @@ -3710,27 +2888,28 @@ int main(int argc, char ** argv) { } std::string filepath = params.slot_save_path + filename; - server_task task(SERVER_TASK_TYPE_SLOT_SAVE); - task.id = ctx_server.queue_tasks.get_new_id(); - task.slot_action.slot_id = id_slot; - task.slot_action.filename = filename; - task.slot_action.filepath = filepath; + server_task task; + task.type = SERVER_TASK_TYPE_SLOT_SAVE; + task.data = { + { "id_slot", id_slot }, + { "filename", filename }, + { "filepath", filepath } + }; - ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task); + const int id_task = ctx_server.queue_tasks.post(task); + ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result_ptr result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); + server_task_result result = ctx_server.queue_results.recv(id_task); + ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result->is_error()) { - res_error(res, result->to_json()); - return; + if (result.error) { + res_error(res, result.data); + } else { + res.set_content(result.data.dump(), "application/json"); } - - res_ok(res, result->to_json()); }; - const auto handle_slots_restore = [&ctx_server, &res_error, &res_ok, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { + const auto handle_slots_restore = [&ctx_server, &res_error, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { @@ -3739,52 +2918,49 @@ int main(int argc, char ** argv) { } std::string filepath = params.slot_save_path + filename; - server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); - task.id = ctx_server.queue_tasks.get_new_id(); - task.slot_action.slot_id = id_slot; - task.slot_action.filename = filename; - task.slot_action.filepath = filepath; + server_task task; + task.type = 
SERVER_TASK_TYPE_SLOT_RESTORE; + task.data = { + { "id_slot", id_slot }, + { "filename", filename }, + { "filepath", filepath } + }; - ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task); + const int id_task = ctx_server.queue_tasks.post(task); + ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result_ptr result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); + server_task_result result = ctx_server.queue_results.recv(id_task); + ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result->is_error()) { - res_error(res, result->to_json()); - return; + if (result.error) { + res_error(res, result.data); + } else { + res.set_content(result.data.dump(), "application/json"); } - - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); }; - const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { - server_task task(SERVER_TASK_TYPE_SLOT_ERASE); - task.id = ctx_server.queue_tasks.get_new_id(); - task.slot_action.slot_id = id_slot; + const auto handle_slots_erase = [&ctx_server, &res_error](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { + server_task task; + task.type = SERVER_TASK_TYPE_SLOT_ERASE; + task.data = { + { "id_slot", id_slot }, + }; - ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task); + const int id_task = ctx_server.queue_tasks.post(task); + ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result_ptr result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); + server_task_result result = ctx_server.queue_results.recv(id_task); + ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result->is_error()) { - res_error(res, result->to_json()); - return; + if (result.error) { + res_error(res, result.data); + } else { + res.set_content(result.data.dump(), "application/json"); } - - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); }; - const auto handle_slots_action = [¶ms, &res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { - if (params.slot_save_path.empty()) { - res_error(res, format_error_response("This server does not support slots action. 
Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } + const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); std::string id_slot_str = req.path_params.at("id_slot"); int id_slot; @@ -3809,614 +2985,403 @@ int main(int argc, char ** argv) { } }; - const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { - // this endpoint is publicly available, please only return what is safe to be exposed + const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { + std::string template_key = "tokenizer.chat_template", curr_tmpl; + int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0); + if (tlen > 0) { + std::vector curr_tmpl_buf(tlen + 1, 0); + if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) { + curr_tmpl = std::string(curr_tmpl_buf.data(), tlen); + } + } + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { + { "system_prompt", ctx_server.system_prompt.c_str() }, { "default_generation_settings", ctx_server.default_generation_settings_for_props }, - { "total_slots", ctx_server.params_base.n_parallel }, - { "model_path", ctx_server.params_base.model }, - { "chat_template", ctx_server.chat_templates.template_default->source() }, - { "bos_token", ctx_server.chat_templates.template_default->bos_token() }, - { "eos_token", ctx_server.chat_templates.template_default->eos_token() }, - { "build_info", build_info }, + { "total_slots", ctx_server.params.n_parallel }, + { "chat_template", curr_tmpl.c_str() } }; - if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) { - data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source(); - } - res_ok(res, data); + res.set_content(data.dump(), "application/json; charset=utf-8"); }; - const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params_base.endpoint_props) { - res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - json data = json::parse(req.body); - - // update any props here - - res_ok(res, {{ "success", true }}); - }; - - // handle completion-like requests (completion, chat, infill) - // we can optionally provide a custom format for partial results and final results - const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok]( - server_task_type type, - json & data, - std::function is_connection_closed, - httplib::Response & res, - oaicompat_type oaicompat) { - GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); - - if (ctx_server.params_base.embedding) { + const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { + if (ctx_server.params.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } - auto completion_id = gen_chatcmplid(); - std::vector tasks; + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - try { - const auto & prompt = data.at("prompt"); - // TODO: this log can become very long, put it behind a flag or think about a more compact format - //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); + json data = json::parse(req.body); - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); - tasks.reserve(tokenized_prompts.size()); - for (size_t i = 0; i < tokenized_prompts.size(); i++) { - server_task task = server_task(type); + const int id_task = ctx_server.queue_tasks.get_new_id(); - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; + ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.request_completion(id_task, -1, data, false, false); - task.prompt_tokens = std::move(tokenized_prompts[i]); - task.params = server_task::params_from_json_cmpl( - ctx_server.ctx, - ctx_server.params_base, - data); - task.id_selected_slot = json_value(data, "id_slot", -1); - - // OAI-compat - task.params.oaicompat = oaicompat; - task.params.oaicompat_cmpl_id = completion_id; - // oaicompat_model is already populated by params_from_json_cmpl - - tasks.push_back(task); + if (!json_value(data, "stream", false)) { + server_task_result result = ctx_server.queue_results.recv(id_task); + if (!result.error && result.stop) { + res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); + } else { + res_error(res, result.data); } - } catch (const std::exception & e) { - res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); - return; - } - ctx_server.queue_results.add_waiting_tasks(tasks); - ctx_server.queue_tasks.post(tasks); - - bool stream = json_value(data, "stream", false); - const auto task_ids = server_task::get_list_id(tasks); - - if (!stream) { - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - if (results.size() == 1) { - // single result - res_ok(res, results[0]->to_json()); - } else { - // multiple results (multitask) - json arr = json::array(); - for (auto & res : results) { - arr.push_back(res->to_json()); - } - res_ok(res, arr); - } - }, [&](const json & error_data) { - res_error(res, error_data); - }, is_connection_closed); - - ctx_server.queue_results.remove_waiting_task_ids(task_ids); + ctx_server.queue_results.remove_waiting_task_id(id_task); } else { - const auto chunked_content_provider = [task_ids, &ctx_server, oaicompat](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { - json res_json = result->to_json(); - if (res_json.is_array()) { - for (const auto & res : res_json) { - if (!server_sent_event(sink, "data", res)) { - // sending failed (HTTP connection closed), cancel the generation - return false; - } + const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) { + while (true) { + server_task_result result = ctx_server.queue_results.recv(id_task); + if (!result.error) { + const std::string str = + "data: " + + result.data.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + + LOG_VERBOSE("data stream", { + { "to_send", str } + }); + + if (!sink.write(str.c_str(), str.size())) { + 
ctx_server.queue_results.remove_waiting_task_id(id_task); + return false; + } + + if (result.stop) { + break; } - return true; } else { - return server_sent_event(sink, "data", res_json); + const std::string str = + "error: " + + result.data.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + + LOG_VERBOSE("data stream", { + { "to_send", str } + }); + + if (!sink.write(str.c_str(), str.size())) { + ctx_server.queue_results.remove_waiting_task_id(id_task); + return false; + } + + break; } - }, [&](const json & error_data) { - server_sent_event(sink, "error", error_data); - }, [&sink]() { - // note: do not use req.is_connection_closed here because req is already destroyed - return !sink.is_writable(); - }); - if (oaicompat != OAICOMPAT_TYPE_NONE) { - static const std::string ev_done = "data: [DONE]\n\n"; - sink.write(ev_done.data(), ev_done.size()); } + + ctx_server.queue_results.remove_waiting_task_id(id_task); sink.done(); - return false; + + return true; }; - auto on_complete = [task_ids, &ctx_server] (bool) { - ctx_server.queue_results.remove_waiting_task_ids(task_ids); + auto on_complete = [id_task, &ctx_server] (bool) { + // cancel + ctx_server.request_cancel(id_task); + ctx_server.queue_results.remove_waiting_task_id(id_task); }; res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); } }; - const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - json data = json::parse(req.body); - return handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_NONE); - }; + const auto handle_models = [¶ms, &model_meta](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - json data = oaicompat_completion_params_parse(json::parse(req.body)); - return handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_COMPLETION); - }; - - const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - // check model compatibility - std::string err; - if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "prefix token is missing. "; - } - if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "suffix token is missing. "; - } - if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "middle token is missing. 
"; - } - if (!err.empty()) { - res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - json data = json::parse(req.body); - - // validate input - if (data.contains("prompt") && !data.at("prompt").is_string()) { - // prompt is optional - res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); - } - - if (!data.contains("input_prefix")) { - res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); - } - - if (!data.contains("input_suffix")) { - res_error(res, format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); - } - - if (data.contains("input_extra") && !data.at("input_extra").is_array()) { - // input_extra is optional - res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - json input_extra = json_value(data, "input_extra", json::array()); - for (const auto & chunk : input_extra) { - // { "text": string, "filename": string } - if (!chunk.contains("text") || !chunk.at("text").is_string()) { - res_error(res, format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); - return; - } - // filename is optional - if (chunk.contains("filename") && !chunk.at("filename").is_string()) { - res_error(res, format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - data["input_extra"] = input_extra; // default to empty array if it's not exist - - std::string prompt = json_value(data, "prompt", std::string()); - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true); - SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); - data["prompt"] = format_infill( - ctx_server.vocab, - data.at("input_prefix"), - data.at("input_suffix"), - data.at("input_extra"), - ctx_server.params_base.n_batch, - ctx_server.params_base.n_predict, - ctx_server.slots[0].n_ctx, // TODO: there should be a better way - ctx_server.params_base.spm_infill, - tokenized_prompts[0] - ); - - return handle_completions_impl( - SERVER_TASK_TYPE_INFILL, - data, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_NONE); // infill is not OAI compatible - }; - - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - LOG_DBG("request: %s\n", req.body.c_str()); - if (ctx_server.params_base.embedding) { - res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - auto body = json::parse(req.body); - json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates); - - return handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_CHAT); - }; - - // same with handle_chat_completions, but without inference part - const auto handle_apply_template = [&ctx_server, ¶ms, &res_ok](const httplib::Request & req, httplib::Response & res) { - auto body = json::parse(req.body); - json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates); - res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); - }; - - const auto handle_models = [¶ms, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { json models = { {"object", "list"}, {"data", { - { - {"id", params.model_alias.empty() ? params.model : params.model_alias}, - {"object", "model"}, - {"created", std::time(0)}, - {"owned_by", "llamacpp"}, - {"meta", ctx_server.model_meta()} - }, + { + {"id", params.model_alias}, + {"object", "model"}, + {"created", std::time(0)}, + {"owned_by", "llamacpp"}, + {"meta", model_meta} + }, }} }; - res_ok(res, models); + res.set_content(models.dump(), "application/json; charset=utf-8"); }; - const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - - json tokens_response = json::array(); - if (body.count("content") != 0) { - const bool add_special = json_value(body, "add_special", false); - const bool with_pieces = json_value(body, "with_pieces", false); - - llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true); - - if (with_pieces) { - for (const auto& token : tokens) { - std::string piece = common_token_to_piece(ctx_server.ctx, token); - json piece_json; - - // Check if the piece is valid UTF-8 - if (is_valid_utf8(piece)) { - piece_json = piece; - } else { - // If not valid UTF-8, store as array of byte values - piece_json = json::array(); - for (unsigned char c : piece) { - piece_json.push_back(static_cast(c)); - } - } - - tokens_response.push_back({ - {"id", token}, - {"piece", piece_json} - }); - } - } else { - tokens_response = tokens; - } + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { + if (ctx_server.params.embedding) { + res_error(res, format_error_response("This server does not support chat completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + return; } - const json data = format_tokenizer_response(tokens_response); - res_ok(res, data); + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); + + const int id_task = ctx_server.queue_tasks.get_new_id(); + + ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.request_completion(id_task, -1, data, false, false); + + const auto completion_id = gen_chatcmplid(); + if (!json_value(data, "stream", false)) { + server_task_result result = ctx_server.queue_results.recv(id_task); + + if (!result.error && result.stop) { + json result_oai = format_final_response_oaicompat(data, result.data, completion_id); + + res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); + } else { + res_error(res, result.data); + } + ctx_server.queue_results.remove_waiting_task_id(id_task); + } else { + const auto chunked_content_provider = [id_task, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { + while (true) { + server_task_result result = ctx_server.queue_results.recv(id_task); + if (!result.error) { + std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); + + for (auto it = result_array.begin(); it != result_array.end(); ++it) { + if (!it->empty()) { + const std::string str = + "data: " + + it->dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + ctx_server.queue_results.remove_waiting_task_id(id_task); + return false; + } + } + } + if (result.stop) { + break; + } + } else { + const std::string str = + "error: " + + result.data.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + ctx_server.queue_results.remove_waiting_task_id(id_task); + return false; + } + break; + } + } + sink.done(); + ctx_server.queue_results.remove_waiting_task_id(id_task); + return true; + }; + + auto on_complete = [id_task, &ctx_server](bool) { + // cancel request + ctx_server.request_cancel(id_task); + ctx_server.queue_results.remove_waiting_task_id(id_task); + }; + + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); + } }; - const auto handle_detokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) { + const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { + if (ctx_server.params.embedding) { + res_error(res, format_error_response("This server does not support infill. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + return; + } + + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + + json data = json::parse(req.body); + + const int id_task = ctx_server.queue_tasks.get_new_id(); + + ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.request_completion(id_task, -1, data, true, false); + + if (!json_value(data, "stream", false)) { + server_task_result result = ctx_server.queue_results.recv(id_task); + if (!result.error && result.stop) { + res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); + } else { + res_error(res, result.data); + } + + ctx_server.queue_results.remove_waiting_task_id(id_task); + } else { + const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) { + while (true) { + server_task_result result = ctx_server.queue_results.recv(id_task); + if (!result.error) { + const std::string str = + "data: " + + result.data.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + + LOG_VERBOSE("data stream", { + { "to_send", str } + }); + + if (!sink.write(str.c_str(), str.size())) { + ctx_server.queue_results.remove_waiting_task_id(id_task); + return false; + } + + if (result.stop) { + break; + } + } else { + break; + } + } + + ctx_server.queue_results.remove_waiting_task_id(id_task); + sink.done(); + + return true; + }; + + auto on_complete = [id_task, &ctx_server] (bool) { + ctx_server.request_cancel(id_task); + }; + + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); + } + }; + + const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + const json body = json::parse(req.body); + + std::vector tokens; + if (body.count("content") != 0) { + const bool add_special = json_value(body, "add_special", false); + tokens = ctx_server.tokenize(body.at("content"), add_special); + } + const json data = format_tokenizer_response(tokens); + return res.set_content(data.dump(), "application/json; charset=utf-8"); + }; + + const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); std::string content; if (body.count("tokens") != 0) { - const llama_tokens tokens = body.at("tokens"); + const std::vector tokens = body.at("tokens"); content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend()); } const json data = format_detokenized_response(content); - res_ok(res, data); + return res.set_content(data.dump(), "application/json; charset=utf-8"); }; - const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) { + const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + const json body = json::parse(req.body); + bool is_openai = false; - if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { - res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. 
Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - // for the shape of input/content, see tokenize_input_prompts() + // an input prompt can be a string or a list of tokens (integer) json prompt; if (body.count("input") != 0) { + is_openai = true; prompt = body.at("input"); - } else if (body.contains("content")) { - oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible - prompt = body.at("content"); + } else if (body.count("content") != 0) { + // with "content", we only support single prompt + prompt = std::vector{body.at("content")}; } else { res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); return; } - bool use_base64 = false; - if (body.count("encoding_format") != 0) { - const std::string& format = body.at("encoding_format"); - if (format == "base64") { - use_base64 = true; - } else if (format != "float") { - res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); - for (const auto & tokens : tokenized_prompts) { - // this check is necessary for models that do not add BOS token to the input - if (tokens.empty()) { - res_error(res, format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - // create and queue the task - json responses = json::array(); - bool error = false; + json responses; { - std::vector tasks; - for (size_t i = 0; i < tokenized_prompts.size(); i++) { - server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); - - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - task.prompt_tokens = std::move(tokenized_prompts[i]); - - // OAI-compat - task.params.oaicompat = oaicompat; - - tasks.push_back(task); - } - - ctx_server.queue_results.add_waiting_tasks(tasks); - ctx_server.queue_tasks.post(tasks); + const int id_task = ctx_server.queue_tasks.get_new_id(); + ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.request_completion(id_task, -1, {{"prompt", prompt}}, false, true); // get the result - std::unordered_set task_ids = server_task::get_list_id(tasks); - - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - for (auto & res : results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); + server_task_result result = ctx_server.queue_results.recv(id_task); + ctx_server.queue_results.remove_waiting_task_id(id_task); + if (!result.error) { + if (result.data.count("results")) { + // result for multi-task + responses = result.data.at("results"); + } else { + // result for single task + responses = std::vector{result.data}; } - }, [&](const json & error_data) { - res_error(res, error_data); - error = true; - }, req.is_connection_closed); - - ctx_server.queue_results.remove_waiting_task_ids(task_ids); - } - - if (error) { - return; + } else { + // error received, ignore everything else + res_error(res, result.data); + return; + } } // write JSON response - json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING - ? format_embeddings_response_oaicompat(body, responses, use_base64) - : json(responses); - res_ok(res, root); + json root = is_openai + ? 
format_embeddings_response_oaicompat(body, responses) + : responses[0]; + return res.set_content(root.dump(), "application/json; charset=utf-8"); }; - const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE); - }; - - const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING); - }; - - const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) { - res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - const json body = json::parse(req.body); - - // TODO: implement - //int top_n = 1; - //if (body.count("top_n") != 1) { - // top_n = body.at("top_n"); - //} else { - // res_error(res, format_error_response("\"top_n\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - // return; - //} - - json query; - if (body.count("query") == 1) { - query = body.at("query"); - if (!query.is_string()) { - res_error(res, format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } else { - res_error(res, format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - std::vector documents = json_value(body, "documents", std::vector()); - if (documents.empty()) { - res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0]; - - // create and queue the task - json responses = json::array(); - bool error = false; - { - std::vector tasks; - std::vector tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true); - tasks.reserve(tokenized_docs.size()); - for (size_t i = 0; i < tokenized_docs.size(); i++) { - server_task task = server_task(SERVER_TASK_TYPE_RERANK); - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]); - tasks.push_back(task); - } - - ctx_server.queue_results.add_waiting_tasks(tasks); - ctx_server.queue_tasks.post(tasks); - - // get the result - std::unordered_set task_ids = server_task::get_list_id(tasks); - - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - for (auto & res : results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); - } - }, [&](const json & error_data) { - res_error(res, error_data); - error = true; - }, req.is_connection_closed); - } - - if (error) { - return; - } - - // write JSON response - json root = format_response_rerank(body, responses); - res_ok(res, root); - }; - - const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { - json result = json::array(); - const auto & loras = ctx_server.params_base.lora_adapters; - for (size_t i = 0; i < loras.size(); ++i) { - auto & lora = loras[i]; - result.push_back({ - {"id", i}, - {"path", lora.path}, - {"scale", lora.scale}, - }); - } - res_ok(res, result); - res.status = 200; // HTTP OK - }; - - const auto handle_lora_adapters_apply = 
[&](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - if (!body.is_array()) { - res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); - return; - } - server_task task(SERVER_TASK_TYPE_SET_LORA); - task.id = ctx_server.queue_tasks.get_new_id(); - task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); - ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task); - - server_task_result_ptr result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); + auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) { + return [content, len, mime_type](const httplib::Request &, httplib::Response & res) { + res.set_content(reinterpret_cast(content), len, mime_type); + return false; + }; }; // // Router // - if (!params.webui) { - LOG_INF("Web UI is disabled\n"); - } else { - // register static assets routes - if (!params.public_path.empty()) { - // Set the base directory for serving static files - bool is_found = svr->set_mount_point("/", params.public_path); - if (!is_found) { - LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); - return 1; - } - } else { - // using embedded static index.html - svr->Get("/", [](const httplib::Request & req, httplib::Response & res) { - if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { - res.set_content("Error: gzip is not supported by this browser", "text/plain"); - } else { - res.set_header("Content-Encoding", "gzip"); - // COEP and COOP headers, required by pyodide (python interpreter) - res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); - res.set_header("Cross-Origin-Opener-Policy", "same-origin"); - res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); - } - return false; - }); - } + // register static assets routes + if (!params.public_path.empty()) { + // Set the base directory for serving static files + svr->set_base_dir(params.public_path); } + // using embedded static files + svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); + svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); + svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); + svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); + + // add new-ui files + svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); + svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, 
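Two serving strategies appear above: the branch being removed streams a single gzip-compressed, embedded `index.html` (and adds the COEP/COOP headers pyodide needs), while the restored code registers one route per embedded asset. From the client side, fetching the root page looks the same either way; the sketch below assumes a locally running server with the web UI enabled.

```python
# Fetch the web UI root page from a locally running llama-server (assumed on
# localhost:8080). `requests` advertises gzip support and decompresses replies.
import requests

resp = requests.get("http://localhost:8080/", headers={"Accept-Encoding": "gzip"})
print(resp.status_code, resp.headers.get("Content-Type"))

# These cross-origin headers are set only by the gzip-embedded variant shown above:
print(resp.headers.get("Cross-Origin-Embedder-Policy"))
print(resp.headers.get("Cross-Origin-Opener-Policy"))
```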
theme_playground_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); + svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); + svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); + svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + // register API routes - svr->Get ("/health", handle_health); // public endpoint (no API key check) + svr->Get ("/health", handle_health); + svr->Get ("/slots", handle_slots); svr->Get ("/metrics", handle_metrics); svr->Get ("/props", handle_props); - svr->Post("/props", handle_props_change); - svr->Get ("/models", handle_models); // public endpoint (no API key check) - svr->Get ("/v1/models", handle_models); // public endpoint (no API key check) + svr->Get ("/v1/models", handle_models); svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); - svr->Post("/v1/completions", handle_completions_oai); + svr->Post("/v1/completions", handle_completions); svr->Post("/chat/completions", handle_chat_completions); svr->Post("/v1/chat/completions", handle_chat_completions); svr->Post("/infill", handle_infill); svr->Post("/embedding", handle_embeddings); // legacy svr->Post("/embeddings", handle_embeddings); - svr->Post("/v1/embeddings", handle_embeddings_oai); - svr->Post("/rerank", handle_rerank); - svr->Post("/reranking", handle_rerank); - svr->Post("/v1/rerank", handle_rerank); - svr->Post("/v1/reranking", handle_rerank); + svr->Post("/v1/embeddings", handle_embeddings); svr->Post("/tokenize", handle_tokenize); svr->Post("/detokenize", handle_detokenize); - svr->Post("/apply-template", handle_apply_template); - // LoRA adapters hotswap - svr->Get ("/lora-adapters", handle_lora_adapters_list); - svr->Post("/lora-adapters", handle_lora_adapters_apply); - // Save & load slots - svr->Get ("/slots", handle_slots); - svr->Post("/slots/:id_slot", handle_slots_action); + if (!params.slot_save_path.empty()) { + // only enable slot endpoints if slot_save_path is set + svr->Post("/slots/:id_slot", handle_slots_action); + } // // Start the server @@ -4428,75 +3393,36 @@ int main(int argc, char ** argv) { log_data["n_threads_http"] = std::to_string(params.n_threads_http); svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); }; - // clean up function, to be called before exit - auto clean_up = [&svr]() { - svr->stop(); - llama_backend_free(); - }; + LOG_INFO("HTTP server listening", log_data); - // bind HTTP listen port - bool was_bound = false; - if (params.port == 0) { - int bound_port = svr->bind_to_any_port(params.hostname); - if ((was_bound = (bound_port >= 0))) { - params.port = bound_port; + // run the HTTP server in a thread - see comment below + std::thread t([&]() { + if (!svr->listen_after_bind()) { + state.store(SERVER_STATE_ERROR); + return 1; } - } else { - was_bound = svr->bind_to_port(params.hostname, params.port); - } - if (!was_bound) { - //LOG_ERROR("couldn't bind HTTP server socket", { - // {"hostname", params.hostname}, - // {"port", params.port}, - //}); - LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", 
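As a quick sanity check of the route table above: `/health` is served without an API key, while generation endpoints expect a bearer token when the server was started with one. The key value, port, and prompt below are assumptions used only for illustration.

```python
# Probe a few of the routes registered above on a locally running llama-server
# (localhost:8080 and the API key "llama.cpp" are assumptions).
import requests

BASE = "http://localhost:8080"

print(requests.get(f"{BASE}/health").status_code)   # 200 once the model is loaded
print(requests.get(f"{BASE}/v1/models").json())     # list of served models

headers = {"Authorization": "Bearer llama.cpp"}     # only needed if an API key is set
r = requests.post(f"{BASE}/completion",
                  json={"prompt": "I believe the meaning of life is", "n_predict": 8},
                  headers=headers)
print(r.json().get("content"))
```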
__func__, params.hostname.c_str(), params.port); - clean_up(); - return 1; - } - - // run the HTTP server in a thread - std::thread t([&]() { svr->listen_after_bind(); }); - svr->wait_until_ready(); - - LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); - - // load the model - LOG_INF("%s: loading model\n", __func__); - - if (!ctx_server.load_model(params)) { - clean_up(); - t.join(); - LOG_ERR("%s: exiting due to model loading error\n", __func__); - return 1; - } - - ctx_server.init(); - state.store(SERVER_STATE_READY); - - LOG_INF("%s: model loaded\n", __func__); - - // print sample chat example to make it clear which template is used - LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__, - ctx_server.chat_templates.template_default->source().c_str(), - common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str()); - - ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) { - ctx_server.process_single_task(task); + return 0; }); - ctx_server.queue_tasks.on_update_slots([&ctx_server]() { - ctx_server.update_slots(); - }); + ctx_server.queue_tasks.on_new_task(std::bind( + &server_context::process_single_task, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_finish_multitask(std::bind( + &server_context::on_finish_multitask, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_update_slots(std::bind( + &server_context::update_slots, &ctx_server)); + ctx_server.queue_results.on_multitask_update(std::bind( + &server_queue::update_multitask, + &ctx_server.queue_tasks, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3 + )); shutdown_handler = [&](int) { ctx_server.queue_tasks.terminate(); }; - LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); - - ctx_server.queue_tasks.start_loop(); - #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; @@ -4511,8 +3437,12 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - clean_up(); + ctx_server.queue_tasks.start_loop(); + + svr->stop(); t.join(); + llama_backend_free(); + return 0; } diff --git a/examples/server/tests/.gitignore b/examples/server/tests/.gitignore deleted file mode 100644 index 90ee7fe6d..000000000 --- a/examples/server/tests/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -.venv -tmp diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 1de0eb30e..5e6cb277b 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -1,9 +1,19 @@ # Server tests -Python based server tests scenario using [pytest](https://docs.pytest.org/en/stable/). +Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) +and [behave](https://behave.readthedocs.io/en/latest/): + +* [issues.feature](./features/issues.feature) Pending issues scenario +* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests +* [security.feature](./features/security.feature) Security, CORS and API Key +* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc... 
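The feature files listed above are driven by an asyncio/aiohttp client defined in `steps.py`. As a point of reference, here is a minimal sketch of the health polling those steps perform; the helper name, timeout, and URL are illustrative, not copied from the test code.

```python
# Minimal asyncio + aiohttp sketch of polling /health until the server reports "ok",
# similar in spirit to the wait_for_health_status helper used by the steps.
import asyncio
import aiohttp


async def wait_until_healthy(base_url: str = "http://localhost:8080", timeout: float = 30.0) -> None:
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout
    async with aiohttp.ClientSession() as session:
        while loop.time() < deadline:
            try:
                async with session.get(f"{base_url}/health") as resp:
                    if resp.status == 200 and (await resp.json()).get("status") == "ok":
                        return
            except aiohttp.ClientConnectionError:
                pass  # server socket not accepting connections yet
            await asyncio.sleep(0.1)
    raise TimeoutError("server did not become healthy in time")


if __name__ == "__main__":
    asyncio.run(wait_until_healthy())
```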
Tests target GitHub workflows job runners with 4 vCPU. +Requests are made with an +[aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html)- and [asyncio](https://docs.python.org/fr/3/library/asyncio.html)-based +HTTP client. + Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. To mitigate it, you can increase values in `n_predict`, `kv_size`. @@ -29,38 +39,27 @@ It's possible to override some scenario steps values with environment variables: |--------------------------|------------------------------------------------------------------------------------------------| | `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` | | `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` | -| `DEBUG` | to enable steps and server verbose mode `--verbose` | +| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` | +| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format | | `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` | -| `LLAMA_CACHE` | by default server tests re-download models to the `tmp` subfolder. Set this to your cache (e.g. `$HOME/Library/Caches/llama.cpp` on Mac or `$HOME/.cache/llama.cpp` on Unix) to avoid this | -To run slow tests (will download many models, make sure to set `LLAMA_CACHE` if needed): +### Run @bug, @wip or @wrong_usage annotated scenario + +Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope. + +- `@bug` annotation aims to link a scenario with a GitHub issue. +- `@wrong_usage` scenarios demonstrate user issues that are actually expected behavior +- `@wip` marks a scenario that is a work in progress +- `@slow` heavy test, disabled by default + +To run a scenario annotated with `@bug`, start: ```shell -SLOW_TESTS=1 ./tests.sh +DEBUG=ON ./tests.sh --no-skipped --tags bug --stop ``` -To run with stdout/stderr display in real time (verbose output, but useful for debugging): +After changing logic in `steps.py`, ensure that the `@bug` and `@wrong_usage` scenarios are updated.
```shell -DEBUG=1 ./tests.sh -s -v -x +./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile" ``` - -To run all the tests in a file: - -```shell -./tests.sh unit/test_chat_completion.py.py -v -x -``` - -To run a single test: - -```shell -./tests.sh unit/test_chat_completion.py::test_invalid_chat_completion_req -``` - -Hint: You can compile and run test in single command, useful for local developement: - -```shell -cmake --build build -j --target llama-server && ./examples/server/tests/tests.sh -``` - -To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html) diff --git a/examples/server/tests/conftest.py b/examples/server/tests/conftest.py deleted file mode 100644 index 017d1bb84..000000000 --- a/examples/server/tests/conftest.py +++ /dev/null @@ -1,15 +0,0 @@ -import pytest -from utils import * - - -# ref: https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test -@pytest.fixture(autouse=True) -def stop_server_after_each_test(): - # do nothing before each test - yield - # stop all servers after each test - instances = set( - server_instances - ) # copy the set to prevent 'Set changed size during iteration' - for server in instances: - server.stop() diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature new file mode 100644 index 000000000..6f163ce04 --- /dev/null +++ b/examples/server/tests/features/embeddings.feature @@ -0,0 +1,96 @@ +@llama.cpp +@embeddings +Feature: llama.cpp server + + Background: Server startup + Given a server listening on localhost:8080 + And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf + And a model file bert-bge-small.gguf + And a model alias bert-bge-small + And 42 as server seed + And 2 slots + And 1024 as batch size + And 1024 as ubatch size + And 2048 KV cache size + And embeddings extraction + Then the server is starting + Then the server is healthy + + Scenario: Embedding + When embeddings are computed for: + """ + What is the capital of Bulgaria ? + """ + Then embeddings are generated + + Scenario: OAI Embeddings compatibility + Given a model bert-bge-small + When an OAI compatible embeddings computation request for: + """ + What is the capital of Spain ? + """ + Then embeddings are generated + + Scenario: OAI Embeddings compatibility with multiple inputs + Given a model bert-bge-small + Given a prompt: + """ + In which country Paris is located ? + """ + And a prompt: + """ + Is Madrid the capital of Spain ? + """ + When an OAI compatible embeddings computation request for multiple inputs + Then embeddings are generated + + Scenario: Multi users embeddings + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + Given concurrent embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated + + Scenario: Multi users OAI compatibility embeddings + Given a prompt: + """ + In which country Paris is located ? + """ + And a prompt: + """ + Is Madrid the capital of Spain ? + """ + And a prompt: + """ + What is the biggest US city ? + """ + And a prompt: + """ + What is the capital of Bulgaria ? 
+ """ + And a model bert-bge-small + Given concurrent OAI embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated + + Scenario: All embeddings should be the same + Given 10 fixed prompts + And a model bert-bge-small + Given concurrent OAI embedding requests + Then all embeddings are the same diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py new file mode 100644 index 000000000..e7845dc2f --- /dev/null +++ b/examples/server/tests/features/environment.py @@ -0,0 +1,71 @@ +import os +import signal +import socket +import sys +import time +import traceback +from contextlib import closing +from subprocess import TimeoutExpired + + +def before_scenario(context, scenario): + context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON' + if context.debug: + print("DEBUG=ON") + print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m") + port = 8080 + if 'PORT' in os.environ: + port = int(os.environ['PORT']) + if is_server_listening("localhost", port): + assert False, "Server already started" + + +def after_scenario(context, scenario): + try: + if 'server_process' not in context or context.server_process is None: + return + if scenario.status == "failed": + if 'GITHUB_ACTIONS' in os.environ: + print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n") + if os.path.isfile('llama.log'): + with closing(open('llama.log', 'r')) as f: + for line in f: + print(line) + if not is_server_listening(context.server_fqdn, context.server_port): + print("\x1b[33;101mERROR: Server stopped listening\x1b[0m") + + if context.server_process.poll() is not None: + assert False, f"Server not running pid={context.server_process.pid} ..." + + server_graceful_shutdown(context) # SIGINT + + try: + context.server_process.wait(0.5) + except TimeoutExpired: + print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...") + context.server_process.kill() # SIGKILL + context.server_process.wait() + + while is_server_listening(context.server_fqdn, context.server_port): + time.sleep(0.1) + except Exception: + print("ignoring error in after_scenario:") + traceback.print_exc(file=sys.stdout) + + +def server_graceful_shutdown(context): + print(f"shutting down server pid={context.server_process.pid} ...") + if os.name == 'nt': + interrupt = signal.CTRL_C_EVENT + else: + interrupt = signal.SIGINT + context.server_process.send_signal(interrupt) + + +def is_server_listening(server_fqdn, server_port): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + result = sock.connect_ex((server_fqdn, server_port)) + _is_server_listening = result == 0 + if _is_server_listening: + print(f"server is listening on {server_fqdn}:{server_port}...") + return _is_server_listening diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature new file mode 100644 index 000000000..7b13e44ca --- /dev/null +++ b/examples/server/tests/features/issues.feature @@ -0,0 +1,5 @@ +# List of ongoing issues +# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug +@bug +Feature: Issues + # No confirmed issue at the moment diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature new file mode 100644 index 000000000..6cd306a2b --- /dev/null +++ b/examples/server/tests/features/parallel.feature @@ -0,0 +1,102 @@ +@llama.cpp +@parallel +Feature: Parallel + + Background: Server startup + 
Given a server listening on localhost:8080 + And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models + And a model file test-model-00001-of-00003.gguf + And 42 as server seed + And 128 as batch size + And 256 KV cache size + And 2 slots + And continuous batching + Then the server is starting + Then the server is healthy + + Scenario Outline: Multi users completion + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And max tokens to predict + Given concurrent completion requests + Then the server is busy + Then the server is idle + And all slots are idle + Then all prompts are predicted with tokens + Examples: + | n_predict | + | 128 | + + Scenario Outline: Multi users OAI completions compatibility + Given a system prompt You are a writer. + And a model tinyllama-2 + Given a prompt: + """ + Write a very long book. + """ + And a prompt: + """ + Write another a poem. + """ + And max tokens to predict + And streaming is + Given concurrent OAI completions requests + Then the server is busy + Then the server is idle + Then all prompts are predicted with tokens + Examples: + | streaming | n_predict | + | disabled | 128 | + | enabled | 64 | + + Scenario Outline: Multi users OAI completions compatibility no v1 + Given a system prompt You are a writer. + And a model tinyllama-2 + Given a prompt: + """ + Write a very long book. + """ + And a prompt: + """ + Write another a poem. + """ + And max tokens to predict + And streaming is + Given concurrent OAI completions requests no v1 + Then the server is busy + Then the server is idle + Then all prompts are predicted with tokens + Examples: + | streaming | n_predict | + | disabled | 128 | + | enabled | 64 | + + + Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969 + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + And 128 max tokens to predict + Given concurrent completion requests + Then the server is busy + Then the server is idle + Then all prompts are predicted diff --git a/examples/server/tests/features/passkey.feature b/examples/server/tests/features/passkey.feature new file mode 100644 index 000000000..6a5a84e6a --- /dev/null +++ b/examples/server/tests/features/passkey.feature @@ -0,0 +1,54 @@ +# run with: ./tests.sh --no-skipped --tags passkey +@passkey +@slow +Feature: Passkey / Self-extend with context shift + + Background: Server startup + Given a server listening on localhost:8080 + + # Generates a long text of junk and inserts a secret passkey number inside it. + # Then we query the LLM for the secret passkey. + # see #3856 and #4810 + Scenario Outline: Passkey + Given a model file from HF repo + And as batch size + And as number of junk + And server max tokens to predict + And 42 as seed + And KV cache size + And 1 slots + And group attention factor to extend context size through self-extend + And group attention width to extend context size through self-extend + # Can be override with N_GPU_LAYERS + And GPU offloaded layers + Then the server is starting + Then the server is healthy + Given available models + Then model 0 is trained on tokens context + Given a prefix prompt: + """ + here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. 
I will quiz you about the important information there. + """ + And a passkey prompt template: + """ + The pass key is Remember it. is the pass key. + """ + And a junk suffix prompt: + """ + The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again. + """ + And a suffix prompt: + """ + What is the pass key? The pass key is + """ + Given a "" passkey challenge prompt with the passkey inserted every junk + And a completion request with no api error + Then tokens are predicted matching + + Examples: + | hf_repo | hf_file | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content | + | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 4 | 512 | 250 | 50 | 42 | 1 | 42 | + | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 2 | 512 | 250 | 50 | 42 | 1 | \b((?!42)\w)+\b | + #| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 | + #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0 + # 987 | diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature new file mode 100644 index 000000000..e8e1b5414 --- /dev/null +++ b/examples/server/tests/features/results.feature @@ -0,0 +1,118 @@ +@llama.cpp +@results +Feature: Results + + Background: Server startup + Given a server listening on localhost:8080 + And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models + And a model file test-model-00001-of-00003.gguf + And 128 as batch size + And 1024 KV cache size + And 128 max tokens to predict + And continuous batching + + Scenario Outline: consistent results with same seed + Given slots + And 1.0 temperature + Then the server is starting + Then the server is healthy + + Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42 + + Given concurrent completion requests + Then the server is busy + Then the server is idle + And all slots are idle + Then all predictions are equal + Examples: + | n_slots | + | 1 | + # FIXME: unified KV cache nondeterminism + # | 2 | + + Scenario Outline: different results with different seed + Given slots + And 1.0 temperature + Then the server is starting + Then the server is healthy + + Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42 + Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43 + Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44 + Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45 + + Given concurrent completion requests + Then the server is busy + Then the server is idle + And all slots are idle + Then all predictions are different + Examples: + | n_slots | + | 1 | + | 2 | + + Scenario Outline: consistent results with same seed and varying batch size + Given 4 slots + And temperature + # And 0 as draft + Then the server is starting + Then the server is healthy + + Given 1 prompts "Write a very long story about AI." with seed 42 + And concurrent completion requests + # Then the server is busy # Not all slots will be utilized. + Then the server is idle + And all slots are idle + + Given prompts "Write a very long story about AI." with seed 42 + And concurrent completion requests + # Then the server is busy # Not all slots will be utilized. 
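The passkey scenario builds one long prompt out of a prefix, `n_junk` copies of a junk sentence, a passkey sentence inserted at position `i_pos`, and a question suffix (see the `n_junk`/`i_pos` columns in the Examples table). Below is a standalone sketch of that assembly; the helper name and the values in the `__main__` block are illustrative only.

```python
# Standalone sketch of how the passkey challenge prompt is assembled
# (helper name and defaults are illustrative, not part of the test suite).
def build_passkey_prompt(passkey: str, n_junk: int, i_pos: int) -> str:
    prefix = ("here is an important info hidden inside a lot of irrelevant text. "
              "Find it and memorize them. I will quiz you about the important information there.\n")
    junk = "The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.\n"
    passkey_line = f"The pass key is {passkey} Remember it. {passkey} is the pass key.\n"
    suffix = "What is the pass key? The pass key is"

    body = ""
    for i in range(n_junk):
        if i == i_pos:
            body += passkey_line  # the secret is buried at one junk position
        body += junk
    return prefix + body + suffix


if __name__ == "__main__":
    prompt = build_passkey_prompt("42", n_junk=250, i_pos=50)
    print(len(prompt.split()), "words")
```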
+ Then the server is idle + And all slots are idle + + Then all predictions are equal + Examples: + | n_parallel | temp | + | 1 | 0.0 | + | 1 | 1.0 | + # FIXME: unified KV cache nondeterminism + # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227 + # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 + # and https://github.com/ggerganov/llama.cpp/pull/7347 . + # | 2 | 0.0 | + # | 4 | 0.0 | + # | 2 | 1.0 | + # | 4 | 1.0 | + + Scenario Outline: consistent token probs with same seed and prompt + Given slots + And KV cache size + And 1.0 temperature + And max tokens to predict + Then the server is starting + Then the server is healthy + + Given 1 prompts "The meaning of life is" with seed 42 + And concurrent completion requests + # Then the server is busy # Not all slots will be utilized. + Then the server is idle + And all slots are idle + + Given prompts "The meaning of life is" with seed 42 + And concurrent completion requests + # Then the server is busy # Not all slots will be utilized. + Then the server is idle + And all slots are idle + + Then all token probabilities are equal + Examples: + | n_slots | n_kv | n_predict | n_parallel | + | 4 | 1024 | 1 | 1 | + # FIXME: unified KV cache nondeterminism + # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227 + # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 + # and https://github.com/ggerganov/llama.cpp/pull/7347 . + # | 4 | 1024 | 1 | 4 | + # | 4 | 1024 | 100 | 1 | + # This test still fails even the above patches; the first token probabilities are already different. + # | 4 | 1024 | 100 | 4 | diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature new file mode 100644 index 000000000..eb82e7aca --- /dev/null +++ b/examples/server/tests/features/security.feature @@ -0,0 +1,68 @@ +@llama.cpp +@security +Feature: Security + + Background: Server startup with an api key defined + Given a server listening on localhost:8080 + And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a server api key llama.cpp + Then the server is starting + Then the server is healthy + + Scenario Outline: Completion with some user api key + Given a prompt test + And a user api key + And 4 max tokens to predict + And a completion request with api error + + Examples: Prompts + | api_key | api_error | + | llama.cpp | no | + | llama.cpp | no | + | hackeme | raised | + | | raised | + + Scenario Outline: OAI Compatibility + Given a system prompt test + And a user prompt test + And a model test + And 2 max tokens to predict + And streaming is disabled + And a user api key + Given an OAI compatible chat completions request with api error + + Examples: Prompts + | api_key | api_error | + | llama.cpp | no | + | llama.cpp | no | + | hackme | raised | + + Scenario Outline: OAI Compatibility (invalid response formats) + Given a system prompt test + And a user prompt test + And a response format + And a model test + And 2 max tokens to predict + And streaming is disabled + Given an OAI compatible chat completions request with raised api error + + Examples: Prompts + | response_format | + | {"type": "sound"} | + | {"type": "json_object", "schema": 123} | + | {"type": "json_object", "schema": {"type": 123}} | + | {"type": "json_object", "schema": {"type": "hiccup"}} | + + + Scenario Outline: CORS Options + Given a user api key llama.cpp + When an OPTIONS request is sent from + Then CORS 
header is set to + + Examples: Headers + | origin | cors_header | cors_header_value | + | localhost | Access-Control-Allow-Origin | localhost | + | web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr | + | origin | Access-Control-Allow-Credentials | true | + | web.mydomain.fr | Access-Control-Allow-Methods | POST | + | web.mydomain.fr | Access-Control-Allow-Headers | * | diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature new file mode 100644 index 000000000..b55971454 --- /dev/null +++ b/examples/server/tests/features/server.feature @@ -0,0 +1,112 @@ +@llama.cpp +@server +Feature: llama.cpp server + + Background: Server startup + Given a server listening on localhost:8080 + And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a model file test-model.gguf + And a model alias tinyllama-2 + And BOS token is 1 + And 42 as server seed + # KV Cache corresponds to the total amount of tokens + # that can be stored across all independent sequences: #4130 + # see --ctx-size and #5568 + And 256 KV cache size + And 32 as batch size + And 2 slots + And 64 server max tokens to predict + And prometheus compatible metrics exposed + Then the server is starting + Then the server is healthy + + Scenario: Health + Then the server is ready + And all slots are idle + + + Scenario Outline: Completion + Given a prompt + And max tokens to predict + And a completion request with no api error + Then tokens are predicted matching + And the completion is truncated + And prompt tokens are processed + And prometheus metrics are exposed + And metric llamacpp:tokens_predicted is + + Examples: Prompts + | prompt | n_predict | re_content | n_prompt | n_predicted | truncated | + | I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not | + | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not | + + Scenario: Completion prompt truncated + Given a prompt: + """ + Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. + Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. + Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. + Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + """ + And a completion request with no api error + Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl + And the completion is truncated + And 109 prompt tokens are processed + + + Scenario Outline: OAI Compatibility + Given a model + And a system prompt + And a user prompt + And max tokens to predict + And streaming is + Given an OAI compatible chat completions request with no api error + Then tokens are predicted matching + And prompt tokens are processed + And the completion is truncated + + Examples: Prompts + | model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated | + | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not | + | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. 
| 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | | + + + Scenario Outline: OAI Compatibility w/ response format + Given a model test + And a system prompt test + And a user prompt test + And a response format + And 10 max tokens to predict + Given an OAI compatible chat completions request with no api error + Then tokens are predicted matching + + Examples: Prompts + | response_format | n_predicted | re_content | + | {"type": "json_object", "schema": {"const": "42"}} | 6 | "42" | + | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] | + | {"type": "json_object"} | 10 | \{ " Jacky. | + + + Scenario: Tokenize / Detokenize + When tokenizing: + """ + What is the capital of France ? + """ + Then tokens can be detokenized + And tokens do not begin with BOS + + Scenario: Tokenize w/ BOS + Given adding special tokens + When tokenizing: + """ + What is the capital of Germany? + """ + Then tokens begin with BOS + Given first token is removed + Then tokens can be detokenized + + Scenario: Models available + Given available models + Then 1 models are supported + Then model 0 is identified by tinyllama-2 + Then model 0 is trained on 128 tokens context diff --git a/examples/server/tests/features/slotsave.feature b/examples/server/tests/features/slotsave.feature new file mode 100644 index 000000000..1c281c074 --- /dev/null +++ b/examples/server/tests/features/slotsave.feature @@ -0,0 +1,58 @@ +@llama.cpp +@slotsave +Feature: llama.cpp server slot management + + Background: Server startup + Given a server listening on localhost:8080 + And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And prompt caching is enabled + And 2 slots + And . as slot save path + And 2048 KV cache size + And 42 as server seed + And 24 max tokens to predict + Then the server is starting + Then the server is healthy + + Scenario: Save and Restore Slot + # First prompt in slot 1 should be fully processed + Given a user prompt "What is the capital of France?" + And using slot id 1 + And a completion request with no api error + Then 24 tokens are predicted matching (Lily|cake) + And 22 prompt tokens are processed + When the slot 1 is saved with filename "slot1.bin" + Then the server responds with status code 200 + # Since we have cache, this should only process the last tokens + Given a user prompt "What is the capital of Germany?" + And a completion request with no api error + Then 24 tokens are predicted matching (Thank|special) + And 7 prompt tokens are processed + # Loading the original cache into slot 0, + # we should only be processing 1 prompt token and get the same output + When the slot 0 is restored with filename "slot1.bin" + Then the server responds with status code 200 + Given a user prompt "What is the capital of France?" + And using slot id 0 + And a completion request with no api error + Then 24 tokens are predicted matching (Lily|cake) + And 1 prompt tokens are processed + # For verification that slot 1 was not corrupted during slot 0 load, same thing + Given a user prompt "What is the capital of Germany?" + And using slot id 1 + And a completion request with no api error + Then 24 tokens are predicted matching (Thank|special) + And 1 prompt tokens are processed + + Scenario: Erase Slot + Given a user prompt "What is the capital of France?" 
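The slot-save scenarios here exercise the `/slots/:id_slot` route, which the restored server only registers when a slot save path is configured (the scenario uses `.`). A sketch of the three HTTP calls the step definitions issue (save, restore, erase); the port and filename are assumptions.

```python
# Sketch of the slot save/restore/erase requests used by the slotsave scenarios.
# Assumes llama-server was started with a slot save path and listens on localhost:8080.
import requests

BASE = "http://localhost:8080"

# Persist the KV cache of slot 1 to a file under the slot save path.
requests.post(f"{BASE}/slots/1?action=save", json={"filename": "slot1.bin"})

# Load that cache back into slot 0.
requests.post(f"{BASE}/slots/0?action=restore", json={"filename": "slot1.bin"})

# Drop whatever slot 1 currently holds.
requests.post(f"{BASE}/slots/1?action=erase")
```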
+ And using slot id 1 + And a completion request with no api error + Then 24 tokens are predicted matching (Lily|cake) + And 22 prompt tokens are processed + When the slot 1 is erased + Then the server responds with status code 200 + Given a user prompt "What is the capital of France?" + And a completion request with no api error + Then 24 tokens are predicted matching (Lily|cake) + And 22 prompt tokens are processed diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py new file mode 100644 index 000000000..df0814cc9 --- /dev/null +++ b/examples/server/tests/features/steps/steps.py @@ -0,0 +1,1360 @@ +import asyncio +import json +import os +import re +import socket +import subprocess +import sys +import threading +import time +from collections.abc import Sequence +from contextlib import closing +from re import RegexFlag +from typing import Any, Literal, cast + +import aiohttp +import numpy as np +import openai +from openai.types.chat import ChatCompletionChunk +from behave import step # pyright: ignore[reportAttributeAccessIssue] +from behave.api.async_step import async_run_until_complete +from prometheus_client import parser + +# pyright: reportRedeclaration=false + +@step("a server listening on {server_fqdn}:{server_port}") +def step_server_config(context, server_fqdn: str, server_port: str): + context.server_fqdn = server_fqdn + context.server_port = int(server_port) + context.n_threads = None + context.n_gpu_layer = None + if 'PORT' in os.environ: + context.server_port = int(os.environ['PORT']) + print(f"$PORT set, overriding server port with to {context.server_port}") + if 'FQDN' in os.environ: + context.server_fqdn = os.environ['FQDN'] + print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}") + if 'N_GPU_LAYERS' in os.environ: + context.n_gpu_layer = int(os.environ['N_GPU_LAYERS']) + print(f"$N_GPU_LAYERS set, overriding n_gpu_layer with to {context.n_gpu_layer}") + + context.base_url = f'http://{context.server_fqdn}:{context.server_port}' + + context.model_alias = None + context.model_file = None + context.model_hf_repo = None + context.model_hf_file = None + context.model_url = None + context.n_batch = None + context.n_ubatch = None + context.n_ctx = None + context.n_ga = None + context.n_ga_w = None + context.n_predict = None + context.n_prompts = 0 + context.n_server_predict = None + context.slot_save_path = None + context.id_slot = None + context.cache_prompt = None + context.n_slots = None + context.prompt_prefix = None + context.prompt_suffix = None + context.server_api_key = None + context.server_continuous_batching = False + context.server_embeddings = False + context.server_metrics = False + context.server_process = None + context.seed = None + context.draft = None + context.server_seed = None + context.user_api_key = None + context.response_format = None + context.temperature = None + + context.tasks_result = [] + context.concurrent_tasks = [] + context.prompts = [] + + +@step('a model file {hf_file} from HF repo {hf_repo}') +def step_download_hf_model(context, hf_file: str, hf_repo: str): + context.model_hf_repo = hf_repo + context.model_hf_file = hf_file + context.model_file = os.path.basename(hf_file) + + +@step('a model file {model_file}') +def step_model_file(context, model_file: str): + context.model_file = model_file + + +@step('a model url {model_url}') +def step_model_url(context, model_url: str): + context.model_url = model_url + + +@step('a model alias {model_alias}') +def 
step_model_alias(context, model_alias: str): + context.model_alias = model_alias + + +@step('{seed:d} as server seed') +def step_seed(context, seed: int): + context.server_seed = seed + + +@step('{ngl:d} GPU offloaded layers') +def step_n_gpu_layer(context, ngl: int): + if 'N_GPU_LAYERS' in os.environ: + new_ngl = int(os.environ['N_GPU_LAYERS']) + if context.debug: + print(f"-ngl upgraded from {ngl} to {new_ngl}") + ngl = new_ngl + context.n_gpu_layer = ngl + + +@step('{n_threads:d} threads') +def step_n_threads(context, n_threads: int): + context.n_thread = n_threads + + +@step('{draft:d} as draft') +def step_draft(context, draft: int): + context.draft = draft + + +@step('{n_ctx:d} KV cache size') +def step_n_ctx(context, n_ctx: int): + context.n_ctx = n_ctx + + +@step('{n_slots:d} slots') +def step_n_slots(context, n_slots: int): + context.n_slots = n_slots + + +@step('{n_predict:d} server max tokens to predict') +def step_server_n_predict(context, n_predict: int): + context.n_server_predict = n_predict + + +@step('{slot_save_path} as slot save path') +def step_slot_save_path(context, slot_save_path: str): + context.slot_save_path = slot_save_path + + +@step('using slot id {id_slot:d}') +def step_id_slot(context, id_slot: int): + context.id_slot = id_slot + + +@step('prompt caching is enabled') +def step_enable_prompt_cache(context): + context.cache_prompt = True + + +@step('continuous batching') +def step_server_continuous_batching(context): + context.server_continuous_batching = True + + +@step('embeddings extraction') +def step_server_embeddings(context): + context.server_embeddings = True + + +@step('prometheus compatible metrics exposed') +def step_server_metrics(context): + context.server_metrics = True + + +@step("the server is starting") +def step_start_server(context): + start_server_background(context) + attempts = 0 + max_attempts = 20 + if 'GITHUB_ACTIONS' in os.environ: + max_attempts *= 2 + + addrs = socket.getaddrinfo(context.server_fqdn, context.server_port, type=socket.SOCK_STREAM) + family, typ, proto, _, sockaddr = addrs[0] + + while True: + with closing(socket.socket(family, typ, proto)) as sock: + result = sock.connect_ex(sockaddr) + if result == 0: + print("\x1b[33;46mserver started!\x1b[0m") + return + attempts += 1 + if attempts > max_attempts: + assert False, "server not started" + print(f"waiting for server to start, connect error code = {result}...") + time.sleep(0.1) + + +@step("the server is {expecting_status}") +@async_run_until_complete +async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str): + match expecting_status: + case 'healthy': + await wait_for_health_status(context, context.base_url, 200, 'ok', + timeout=30) + + case 'ready' | 'idle': + await wait_for_health_status(context, context.base_url, 200, 'ok', + timeout=30, + params={'fail_on_no_slot': 0, 'include_slots': 0}, + slots_idle=context.n_slots, + slots_processing=0, + expected_slots=[{'id': slot_id, 'state': 0} + for slot_id in + range(context.n_slots if context.n_slots else 1)]) + case 'busy': + await wait_for_health_status(context, context.base_url, 503, + 'no slot available', + params={'fail_on_no_slot': 0, 'include_slots': 0}, + slots_idle=0, + slots_processing=context.n_slots, + expected_slots=[{'id': slot_id, 'state': 1} + for slot_id in + range(context.n_slots if context.n_slots else 1)]) + case _: + assert False, "unknown status" + + +@step('all slots are {expected_slot_status_string}') +@async_run_until_complete 
+async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str): + match expected_slot_status_string: + case 'idle': + expected_slot_status = 0 + case 'busy': + expected_slot_status = 1 + case _: + assert False, "unknown status" + + expected_slots = [{'id': slot_id, 'state': expected_slot_status} + for slot_id in range(context.n_slots)] + await request_slots_status(context, expected_slots) + + +@step('a completion request with {api_error} api error') +@async_run_until_complete +async def step_request_completion(context, api_error: Literal['raised'] | str): + expect_api_error = api_error == 'raised' + seeds = await completions_seed(context, num_seeds=1) + completion = await request_completion(context.prompts.pop(), + seeds[0] if seeds is not None else seeds, + context.base_url, + debug=context.debug, + n_predict=context.n_predict, + cache_prompt=context.cache_prompt, + id_slot=context.id_slot, + expect_api_error=expect_api_error, + user_api_key=context.user_api_key, + temperature=context.temperature) + context.tasks_result.append(completion) + if context.debug: + print(f"Completion response: {completion}") + if expect_api_error: + assert completion == 401, f"completion must be an 401 status code: {completion}" + + +@step('{predicted_n:d} tokens are predicted matching {re_content}') +def step_n_tokens_predicted_with_content(context, predicted_n, re_content): + context.completion = context.tasks_result.pop() + assert_n_tokens_predicted(context.completion, predicted_n, re_content) + + +@step('{predicted_n:d} tokens are predicted') +def step_n_tokens_predicted(context, predicted_n): + context.completion = context.tasks_result.pop() + assert_n_tokens_predicted(context.completion, predicted_n) + + +@step('all predictions are equal') +@async_run_until_complete +async def step_predictions_equal(context): + n_completions = await gather_tasks_results(context) + assert n_completions >= 2, "need at least 2 completions" + assert_all_predictions_equal(context.tasks_result) + context.tasks_result = [] + + +@step('all predictions are different') +@async_run_until_complete +async def step_predictions_different(context): + n_completions = await gather_tasks_results(context) + assert n_completions >= 2, "need at least 2 completions" + assert_all_predictions_different(context.tasks_result) + context.tasks_result = [] + + +@step('all token probabilities are equal') +@async_run_until_complete +async def step_token_probabilities_equal(context): + n_completions = await gather_tasks_results(context) + assert n_completions >= 2, "need at least 2 completions" + assert_all_token_probabilities_equal(context.tasks_result) + context.tasks_result = [] + + +@step('the completion is truncated') +def step_assert_completion_truncated(context): + step_assert_completion_truncated(context, '') + + +@step('the completion is {truncated} truncated') +def step_assert_completion_truncated(context, truncated): + truncated = truncated != "not" + assert context.completion['truncated'] == truncated, f'{context.completion}' + + +@step('{n_prompt:d} prompt tokens are processed') +def step_impl(context, n_prompt): + assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}" + + +@step('a user prompt {user_prompt}') +def step_user_prompt(context, user_prompt): + context.prompts.append(user_prompt) + context.n_prompts = len(context.prompts) + + +@step('a system prompt {system_prompt}') +def step_system_prompt(context, 
system_prompt): + context.system_prompt = system_prompt + + +@step('a model {model}') +def step_model(context, model): + context.model = model + + +@step('{max_tokens:d} max tokens to predict') +def step_max_tokens(context, max_tokens): + context.n_predict = max_tokens + + +@step('a response format {response_format}') +def step_response_format(context, response_format): + context.response_format = json.loads(response_format) + + +@step('{temperature:f} temperature') +def step_temperature(context, temperature): + context.temperature = temperature + + +@step('streaming is {enable_streaming}') +def step_streaming(context, enable_streaming): + context.enable_streaming = enable_streaming == 'enabled' + + +@step('a user api key {user_api_key}') +def step_user_api_key(context, user_api_key): + context.user_api_key = user_api_key + + +@step('no user api key') +def step_no_user_api_key(context): + context.user_api_key = None + + +@step('a user api key ') +def step_no_user_api_key_space(context): + context.user_api_key = None + + +@step('a server api key {server_api_key}') +def step_server_api_key(context, server_api_key): + context.server_api_key = server_api_key + + +@step('{n_junk:d} as number of junk') +def step_n_junk(context, n_junk): + context.n_junk = n_junk + + +@step('{n_batch:d} as batch size') +def step_n_batch(context, n_batch): + context.n_batch = n_batch + + +@step('{n_ubatch:d} as ubatch size') +def step_n_ubatch(context, n_ubatch): + context.n_ubatch = n_ubatch + + +@step('{seed:d} as seed') +def step_seed(context, seed): + if context.seed is None: + context.seed = [seed] + else: + context.seed.append(seed) + + +@step('BOS token is {bos:d}') +def step_bos_token(context, bos): + context.bos = bos + + +@step('a prefix prompt') +def step_prompt_prefix(context): + context.prompt_prefix = context_text(context) + + +@step('a junk suffix prompt') +def step_prompt_junk_suffix(context): + context.prompt_junk_suffix = context_text(context) + + +@step('a suffix prompt') +def step_prompt_suffix(context): + context.prompt_suffix = context_text(context) + + +@step('{n_ga:d} group attention factor' + ' to extend context size through self-extend') +def step_impl(context, n_ga): + context.n_ga = n_ga + + +@step('{n_ga_w:d} group attention width to extend context size through self-extend') +def step_impl(context, n_ga_w): + context.n_ga_w = n_ga_w + + +@step('a passkey prompt template') +def step_prompt_passkey(context): + context.prompt_passkey = context_text(context) + + +@step('{n_prompts:d} fixed prompts') +def step_fixed_prompts(context, n_prompts): + context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)]) + context.n_prompts = n_prompts + + +@step('a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk') +def step_prompt_passkey(context, passkey, i_pos): + prompt = "" + for i in range(context.n_junk): + if i % context.n_junk == i_pos: + prompt += context.prompt_passkey # the passkey is already substituted + prompt += context.prompt_junk_suffix + if context.debug: + passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m" + print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```") + context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix) + context.n_prompts = len(context.prompts) + + +@step('an OAI compatible chat completions request with {api_error} api error') +@async_run_until_complete +async def step_oai_chat_completions(context, api_error): + if 
context.debug: + print(f"Submitting OAI compatible completions request...") + expect_api_error = api_error == 'raised' + seeds = await completions_seed(context, num_seeds=1), + completion = await oai_chat_completions(context.prompts.pop(), + seeds[0] if seeds is not None else seeds, + context.system_prompt, + context.base_url, + '/v1/chat', + False, + model=context.model if hasattr(context, 'model') else None, + + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + + response_format=context.response_format + if hasattr(context, 'response_format') else None, + + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None, + + expect_api_error=expect_api_error) + context.tasks_result.append(completion) + if context.debug: + print(f"Completion response: {completion}") + if expect_api_error: + assert completion == 401, f"completion must be an 401 status code: {completion}" + + if context.debug: + print(f"Completion response: {completion}") + + +@step('a prompt') +def step_a_prompt(context): + context.prompts.append(context_text(context)) + context.n_prompts = len(context.prompts) + + +@step('a prompt {prompt}') +def step_a_prompt_prompt(context, prompt): + context.prompts.append(prompt) + context.n_prompts = len(context.prompts) + + +@step('{num_prompts:d} prompts {prompt} with seed {seed:d}') +def step_many_prompts(context, num_prompts, prompt, seed): + if context.seed is None: + context.seed = [] + for _ in range(num_prompts): + context.seed.append(seed) + context.prompts.append(prompt) + context.n_prompts = len(context.prompts) + + +@step('concurrent completion requests') +@async_run_until_complete() +async def step_concurrent_completion_requests(context): + await concurrent_requests( + context, + request_completion, + # prompt is inserted automatically + context.base_url, + debug=context.debug, + prompt_prefix=context.prompt_prefix, + prompt_suffix=context.prompt_suffix, + n_predict=context.n_predict if hasattr(context, 'n_predict') else None, + user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None, + temperature=context.temperature, + ) + + +@step('concurrent OAI completions requests') +@async_run_until_complete +async def step_oai_chat_completions(context): + await concurrent_requests(context, oai_chat_completions, + # user_prompt is inserted automatically + context.system_prompt, + context.base_url, + '/v1/chat/completions', + True, # async_client + model=context.model + if hasattr(context, 'model') else None, + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + response_format=context.response_format + if hasattr(context, 'response_format') else None, + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None) + + +@step('concurrent OAI completions requests no v1') +@async_run_until_complete +async def step_oai_chat_completions(context): + await concurrent_requests(context, oai_chat_completions, + # user_prompt is inserted automatically + context.system_prompt, + context.base_url, + '/chat/completions', + True, # async_client + model=context.model + if hasattr(context, 'model') else None, + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + 
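The OAI-compatible steps ultimately hit `/v1/chat/completions` (or `/chat/completions` for the "no v1" variant). Outside of behave, the same request can be made with the `openai` client package the suite already imports; the base URL, placeholder API key, and model alias below are assumptions about a locally running server.

```python
# Sketch of the OpenAI-compatible chat call the steps above target, using the
# same `openai` package imported by steps.py. Values are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="nope")

resp = client.chat.completions.create(
    model="tinyllama-2",
    messages=[
        {"role": "system", "content": "You are a writer."},
        {"role": "user", "content": "Write a very long book."},
    ],
    max_tokens=8,
)
print(resp.choices[0].message.content)
```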
response_format=context.response_format + if hasattr(context, 'response_format') else None, + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None) + + +@step('all prompts are predicted') +@async_run_until_complete +async def step_all_prompts_are_predicted(context): + await all_prompts_are_predicted(context) + + +@step('all prompts are predicted with {n_expected_predicted:d} tokens') +@async_run_until_complete +async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted): + await all_prompts_are_predicted(context, n_expected_predicted) + + +async def all_prompts_are_predicted(context, expected_predicted_n=None): + n_completions = await gather_tasks_results(context) + assert n_completions > 0 + for i in range(n_completions): + assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n) + assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests" + + +@step('embeddings are computed for') +@async_run_until_complete +async def step_compute_embedding(context): + context.n_prompts = 1 + context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url) + + +@step('all embeddings are the same') +@async_run_until_complete +async def step_all_embeddings_are_the_same(context): + n_embedding_requests = await gather_tasks_results(context) + assert n_embedding_requests > 0 + embeddings = [] + for i in range(n_embedding_requests): + embedding = context.tasks_result.pop().pop() + embeddings.append(embedding) + assert_embeddings(embedding) + n = len(embeddings) + for i in range(n-1): + for j in range(i+1, n): + embedding1 = np.array(embeddings[i]) + embedding2 = np.array(embeddings[j]) + if context.debug: + print(f"embedding1: {embedding1[-8:]}") + print(f"embedding2: {embedding2[-8:]}") + similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)) + msg = f"Similarity between {i} and {j}: {similarity:.10f}" + if context.debug: + print(f"{msg}") + assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg + + +@step('embeddings are generated') +def step_assert_embeddings(context): + assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n" + f"context.n_prompts={context.n_prompts}\n" + f"context.embeddings={context.embeddings}") + for embedding in context.embeddings: + assert_embeddings(embedding) + + +@step('an OAI compatible embeddings computation request for') +@async_run_until_complete +async def step_oai_compute_embeddings(context): + context.n_prompts = 1 + context.embeddings = await request_oai_embeddings(context_text(context), None, + base_url=context.base_url, + user_api_key=context.user_api_key, + model=context.model) + + +@step('an OAI compatible embeddings computation request for multiple inputs') +@async_run_until_complete +async def step_oai_compute_embeddings_multiple_inputs(context): + context.embeddings = await request_oai_embeddings(context.prompts, None, + base_url=context.base_url, + user_api_key=context.user_api_key, + model=context.model) + context.prompts.clear() + + +@step('concurrent embedding requests') +@async_run_until_complete() +async def step_concurrent_embedding_requests(context): + await concurrent_requests(context, + request_embedding, + # prompt is inserted automatically + base_url=context.base_url) + + +@step('concurrent OAI embedding requests') +@async_run_until_complete() +async def 
step_concurrent_oai_embedding_requests(context): + await concurrent_requests(context, + request_oai_embeddings, + # prompt is inserted automatically + base_url=context.base_url, + async_client=True, + model=context.model) + + +@step('all embeddings are generated') +@async_run_until_complete() +async def all_embeddings_are_generated(context): + n_embedding_requests = await gather_tasks_results(context) + assert n_embedding_requests == context.n_prompts + for i in range(n_embedding_requests): + assert_embeddings(context.tasks_result.pop().pop()) + + +@step('adding special tokens') +def step_tokenize_set_add_special(context): + context.tokenize_add_special = True + + +@step('tokenizing') +@async_run_until_complete +async def step_tokenize(context): + context.tokenized_text = context_text(context) + async with aiohttp.ClientSession() as session: + tokenize_args = { + "content": context.tokenized_text, + } + if getattr(context, 'tokenize_add_special', None) is not None: + tokenize_args['add_special'] = context.tokenize_add_special + async with session.post(f'{context.base_url}/tokenize', + json=tokenize_args) as response: + assert response.status == 200 + tokenize_json = await response.json() + context.tokens = tokenize_json['tokens'] + + +@step('tokens can be detokenized') +@async_run_until_complete +async def step_detokenize(context): + assert len(context.tokens) > 0 + async with aiohttp.ClientSession() as session: + async with session.post(f'{context.base_url}/detokenize', + json={ + "tokens": context.tokens, + }) as response: + assert response.status == 200 + detokenize_json = await response.json() + # SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15 + assert context.tokenized_text == detokenize_json['content'].strip() + + +@step('tokens begin with BOS') +def step_strings_for_tokenization(context): + assert context.tokens[0] == context.bos + + +@step('tokens do not begin with BOS') +def step_strings_for_tokenization(context): + assert context.tokens[0] != context.bos + + +@step('first token is removed') +def step_strings_for_tokenization(context): + context.tokens = context.tokens[1:] + + +@step('an OPTIONS request is sent from {origin}') +@async_run_until_complete +async def step_options_request(context, origin): + async with aiohttp.ClientSession() as session: + headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin} + async with session.options(f'{context.base_url}/v1/chat/completions', + headers=headers) as response: + assert response.status == 200 + context.options_response = response + + +@step('CORS header {cors_header} is set to {cors_header_value}') +def step_check_options_header_value(context, cors_header, cors_header_value): + assert context.options_response.headers[cors_header] == cors_header_value + + +@step('prometheus metrics are exposed') +@async_run_until_complete +async def step_prometheus_metrics_exported(context): + async with aiohttp.ClientSession() as session: + async with await session.get(f'{context.base_url}/metrics') as metrics_response: + assert metrics_response.status == 200 + assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4" + metrics_raw = await metrics_response.text() + metric_exported = False + if context.debug: + print(f"/metrics answer:\n{metrics_raw}") + context.metrics = {} + for metric in parser.text_string_to_metric_families(metrics_raw): + match metric.name: + case "llamacpp:kv_cache_usage_ratio": + assert len(metric.samples) > 0 + metric_exported = True + 
context.metrics[metric.name] = metric + assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time" + assert metric_exported, "No metrics exported" + + +@step('metric {metric_name} is {metric_value:d}') +def step_assert_metric_value(context, metric_name, metric_value): + if metric_name not in context.metrics: + assert False, f"no metric {metric_name} in {context.metrics.keys()}" + assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}" + + +@step('available models') +def step_available_models(context): + # openai client always expects an api_key + openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope' + openai.base_url = f'{context.base_url}/v1/' + context.models = openai.models.list().data + + +@step('{n_model:d} models are supported') +def step_supported_models(context, n_model): + if context.debug: + print("server models available:", context.models) + assert len(context.models) == n_model + + +@step('model {i_model:d} is {param} {preposition} {param_value}') +def step_supported_models(context, i_model: int, param: Literal['identified', 'trained'] | str, preposition: str, param_value: str): + assert i_model < len(context.models) + model = context.models[i_model] + + param_value = param_value.split(' ', 1)[0] + match param: + case 'identified': + value = model.id + case 'trained': + value = str(model.meta["n_ctx_train"]) + case _: + assert False, "param {param} not supported" + assert param_value == value, f"model param {param} {value} != {param_value}" + + +async def concurrent_requests(context, f_completion, *args, **kwargs): + context.n_prompts = len(context.prompts) + if context.debug: + print(f"starting {context.n_prompts} concurrent completion requests...") + assert context.n_prompts > 0 + seeds = await completions_seed(context) + assert seeds is not None + for prompt_no in range(context.n_prompts): + shifted_args = [context.prompts.pop(), seeds[prompt_no], *args] + context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) + await asyncio.sleep(0.1) + + +@step('the slot {slot_id:d} is saved with filename "{filename}"') +@async_run_until_complete +async def step_save_slot(context, slot_id, filename): + async with aiohttp.ClientSession() as session: + async with session.post(f'{context.base_url}/slots/{slot_id}?action=save', + json={"filename": filename}, + headers={"Content-Type": "application/json"}) as response: + context.response = response + + +@step('the slot {slot_id:d} is restored with filename "{filename}"') +@async_run_until_complete +async def step_restore_slot(context, slot_id, filename): + async with aiohttp.ClientSession() as session: + async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore', + json={"filename": filename}, + headers={"Content-Type": "application/json"}) as response: + context.response = response + + +@step('the slot {slot_id:d} is erased') +@async_run_until_complete +async def step_erase_slot(context, slot_id): + async with aiohttp.ClientSession() as session: + async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase', + headers={"Content-Type": "application/json"}) as response: + context.response = response + + +@step('the server responds with status code {status_code:d}') +def step_server_responds_with_status_code(context, status_code): + assert context.response.status == status_code + + +async def request_completion(prompt, + seed, + base_url, + 
debug=False, + prompt_prefix=None, + prompt_suffix=None, + n_predict=None, + cache_prompt=False, + id_slot=None, + expect_api_error=None, + user_api_key=None, + temperature=None) -> int | dict[str, Any]: + if debug: + print(f"Sending completion request: {prompt}") + origin = "my.super.domain" + headers = { + 'Origin': origin + } + if user_api_key is not None: + if debug: + print(f"Set user_api_key: {user_api_key}") + headers['Authorization'] = f'Bearer {user_api_key}' + + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/completion', + json={ + "input_prefix": prompt_prefix, + "prompt": prompt, + "input_suffix": prompt_suffix, + "n_predict": n_predict if n_predict is not None else -1, + "cache_prompt": cache_prompt, + "id_slot": id_slot, + "seed": seed if seed is not None else 42, + "temperature": temperature if temperature is not None else 0.8, + "n_probs": 2, + }, + headers=headers, + timeout=3600) as response: + if expect_api_error is None or not expect_api_error: + assert response.status == 200 + assert response.headers['Access-Control-Allow-Origin'] == origin + return await response.json() + else: + return response.status + + +async def oai_chat_completions(user_prompt, + seed, + system_prompt, + base_url: str, + base_path: str, + async_client, + debug=False, + temperature=None, + model=None, + n_predict=None, + enable_streaming=None, + response_format=None, + user_api_key=None, + expect_api_error=None) -> int | dict[str, Any]: + if debug: + print(f"Sending OAI Chat completions request: {user_prompt}") + # openai client always expects an api key + user_api_key = user_api_key if user_api_key is not None else 'nope' + seed = seed if seed is not None else 42 + enable_streaming = enable_streaming if enable_streaming is not None else False + payload = { + "messages": [ + { + "role": "system", + "content": system_prompt, + }, + { + "role": "user", + "content": user_prompt, + } + ], + "model": model, + "max_tokens": n_predict, + "stream": enable_streaming, + "temperature": temperature if temperature is not None else 0.0, + "seed": seed, + } + if response_format is not None: + payload['response_format'] = response_format + completion_response = { + 'content': '', + 'timings': { + 'predicted_n': 0, + 'prompt_n': 0 + } + } + if async_client: + origin = 'llama.cpp' + headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}{base_path}', + json=payload, + headers=headers) as response: + if enable_streaming: + assert response.status == 200 + assert response.headers['Access-Control-Allow-Origin'] == origin + assert response.headers['Content-Type'] == "text/event-stream" + event_received = True + while event_received: + event_received = False + async for line_in_bytes in response.content: + line = line_in_bytes.decode('utf-8') + line = line.rstrip('\n').rstrip('\r') + if line == '': + continue + event_data = line.split(': ', 1) + assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```' + chunk_raw = event_data[1] + + chunk = json.loads(chunk_raw) + assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```" + delta = chunk['choices'][0]['delta'] + if 'content' in delta: + completion_response['content'] += delta['content'] + completion_response['timings']['predicted_n'] += 1 + else: + if expect_api_error is None or not expect_api_error: + assert response.status == 200 + assert response.headers['Access-Control-Allow-Origin'] == 
origin + assert response.headers['Content-Type'] == "application/json; charset=utf-8" + chat_completion_raw = await response.json() + completion_response = { + 'content': chat_completion_raw['choices'][0]['message'], + 'timings': { + 'predicted_n': chat_completion_raw['usage']['completion_tokens'], + 'prompt_n': chat_completion_raw['usage']['prompt_tokens'] + } + } + else: + return response.status + else: + try: + openai.api_key = user_api_key + openai.base_url = f'{base_url}{base_path.removesuffix("chat")}' + assert model is not None + chat_completion = openai.chat.completions.create( + messages=payload['messages'], + model=model, + max_tokens=n_predict, + stream=enable_streaming, + response_format=payload.get('response_format') or openai.NOT_GIVEN, + seed=seed, + temperature=payload['temperature'] + ) + except openai.AuthenticationError as e: + if expect_api_error is not None and expect_api_error: + return 401 + else: + assert False, f'error raised: {e}' + + if enable_streaming: + chat_completion = cast(openai.Stream[ChatCompletionChunk], chat_completion) + for chunk in chat_completion: + assert len(chunk.choices) == 1 + delta = chunk.choices[0].delta + if delta.content is not None: + completion_response['content'] += delta.content + completion_response['timings']['predicted_n'] += 1 + completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop' + else: + assert len(chat_completion.choices) == 1 + assert chat_completion.usage is not None + completion_response = { + 'content': chat_completion.choices[0].message.content, + 'timings': { + 'predicted_n': chat_completion.usage.completion_tokens, + 'prompt_n': chat_completion.usage.prompt_tokens + }, + 'truncated': chat_completion.choices[0].finish_reason != 'stop' + } + if debug: + print("OAI response formatted to llama.cpp:", completion_response) + return completion_response + + +async def request_embedding(content, seed, base_url=None) -> list[list[float]]: + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/embedding', + json={ + "content": content, + }) as response: + assert response.status == 200 + response_json = await response.json() + return [response_json['embedding']] + + +async def request_oai_embeddings(input, seed, + base_url=None, user_api_key=None, + model=None, async_client=False) -> list[list[float]]: + # openai client always expects an api_key + user_api_key = user_api_key if user_api_key is not None else 'nope' + if async_client: + origin = 'llama.cpp' + headers=[] + if user_api_key is not None: + headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/v1/embeddings', + json={ + "input": input, + "model": model, + }, + headers=headers, + timeout=3600) as response: + assert response.status == 200, f"received status code not expected: {response.status}" + assert response.headers['Access-Control-Allow-Origin'] == origin + assert response.headers['Content-Type'] == "application/json; charset=utf-8" + response_json = await response.json() + assert response_json['model'] == model, f"invalid model received: {response_json['model']}" + assert response_json['object'] == 'list' + if isinstance(input, Sequence): + embeddings = [] + for an_oai_embeddings in response_json['data']: + embeddings.append(an_oai_embeddings['embedding']) + else: + embeddings = [response_json['data']['embedding']] + return embeddings + else: + openai.api_key = user_api_key + openai.base_url = 
f'{base_url}/v1/' + assert model is not None + oai_embeddings = openai.embeddings.create( + model=model, + input=input, + ) + + return [e.embedding for e in oai_embeddings.data] + + +def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): + content = completion_response['content'] + n_predicted = completion_response['timings']['predicted_n'] + assert len(content) > 0, "no token predicted" + if re_content is not None: + p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL) + matches = p.finditer(content) + last_match = 0 + highlighted = '' + for match in matches: + start, end = match.span() + highlighted += content[last_match: start] + highlighted += '\x1b[33m' + highlighted += content[start: end] + highlighted += '\x1b[0m' + last_match = end + highlighted += content[last_match:] + if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': + print(f"Checking completion response: {highlighted}") + assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```' + if expected_predicted_n and expected_predicted_n > 0: + assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:' + f' {n_predicted} <> {expected_predicted_n}') + +def assert_all_predictions_equal(completion_responses): + if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': + for i, response_i in enumerate(completion_responses): + content_i = response_i['content'] + print(f"content {i}: {content_i}") + for i, response_i in enumerate(completion_responses): + content_i = response_i['content'] + for j, response_j in enumerate(completion_responses): + if i == j: + continue + content_j = response_j['content'] + assert content_i == content_j, "contents not equal" + + +def assert_all_predictions_different(completion_responses): + if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': + for i, response_i in enumerate(completion_responses): + content_i = response_i['content'] + print(f"content {i}: {content_i}") + for i, response_i in enumerate(completion_responses): + content_i = response_i['content'] + for j, response_j in enumerate(completion_responses): + if i == j: + continue + content_j = response_j['content'] + assert content_i != content_j, "contents not different" + + +def assert_all_token_probabilities_equal(completion_responses): + n_predict = len(completion_responses[0]['completion_probabilities']) + if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': + for pos in range(n_predict): + for i, response_i in enumerate(completion_responses): + probs_i = response_i['completion_probabilities'][pos]['probs'] + print(f"pos {pos}, probs {i}: {probs_i}") + for pos in range(n_predict): + for i, response_i in enumerate(completion_responses): + probs_i = response_i['completion_probabilities'][pos]['probs'] + for j, response_j in enumerate(completion_responses): + if i == j: + continue + probs_j = response_j['completion_probabilities'][pos]['probs'] + assert probs_i == probs_j, "contents not equal" + + +async def gather_tasks_results(context): + n_tasks = len(context.concurrent_tasks) + if context.debug: + print(f"Waiting for all {n_tasks} tasks results...") + for task_no in range(n_tasks): + context.tasks_result.append(await context.concurrent_tasks.pop()) + n_completions = len(context.tasks_result) + return n_completions + + +async def wait_for_health_status(context, + base_url, + expected_http_status_code, + expected_health_status, + timeout=3, + params=None, + slots_idle=None, + slots_processing=None, + 
expected_slots=None):
+    if context.debug:
+        print(f"Starting checking for health for expected_health_status={expected_health_status}")
+    interval = 0.5
+    counter = 0
+    if 'GITHUB_ACTIONS' in os.environ:
+        timeout *= 2
+
+    async with aiohttp.ClientSession() as session:
+        while True:
+            async with await session.get(f'{base_url}/health', params=params) as health_response:
+                status_code = health_response.status
+                health = await health_response.json()
+                if context.debug:
+                    print(f"HEALTH - response for expected health status='{expected_health_status}' on "
+                          f"'{base_url}/health'?{params} is {health}\n")
+                if (status_code == expected_http_status_code
+                        and health['status'] == expected_health_status
+                        and (slots_idle is None or health['slots_idle'] == slots_idle)
+                        and (slots_processing is None or health['slots_processing'] == slots_processing)):
+                    if expected_slots is not None:
+                        assert_slots_status(health['slots'], expected_slots)
+                    return
+            await asyncio.sleep(interval)
+
+            counter += interval
+            if counter >= timeout:
+                # Sometimes health requests are triggered after completions are predicted
+                if expected_http_status_code == 503:
+                    if len(context.tasks_result) == 0:
+                        print("\x1b[5;37;43mWARNING: forcing concurrent tasks,"
+                              " busy health check missed, probably too fast inference\x1b[0m\n")
+                        n_completions = await gather_tasks_results(context)
+                        if n_completions > 0:
+                            return
+
+                assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
+
+
+def assert_embeddings(embeddings):
+    assert len(embeddings) > 0
+    embeddings_computed = False
+    for emb in embeddings:
+        if not isinstance(emb, float):
+            assert False, f"Bad embeddings: {embeddings}"
+        if emb != 0:
+            embeddings_computed = True
+    assert embeddings_computed, f"Embeddings: {embeddings}"
+
+
+async def request_slots_status(context, expected_slots):
+    async with aiohttp.ClientSession() as session:
+        async with await session.get(f'{context.base_url}/slots') as slots_response:
+            assert slots_response.status == 200
+            slots = await slots_response.json()
+            assert_slots_status(slots, expected_slots)
+
+
+def assert_slots_status(slots, expected_slots):
+    assert len(slots) == len(expected_slots)
+    for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)):
+        for key in expected:
+            assert expected[key] == slot[key], (f"invalid slot {slot_id}"
+                                                f" expected[{key}] != slot[{key}]"
+                                                f" = {expected[key]} != {slot[key]}")
+
+
+async def completions_seed(context, num_seeds=None):
+    if hasattr(context, "seed") and context.seed is not None:
+        assert len(context.seed) == context.n_prompts
+        if num_seeds is None:
+            num_seeds = context.n_prompts
+        assert num_seeds <= context.n_prompts
+        seeds = context.seed[:num_seeds]
+        context.seed = context.seed[num_seeds:] if num_seeds < context.n_prompts else None
+        return seeds
+
+    if hasattr(context, "server_seed") and context.server_seed is not None:
+        if num_seeds is None:
+            return [context.server_seed] * context.n_prompts
+        else:
+            return [context.server_seed] * num_seeds
+    return None
+
+
+def context_text(context):
+    return context.text.replace('\r', '')
+
+
+def start_server_background(context):
+    if os.name == 'nt':
+        context.server_path =
'../../../build/bin/Release/llama-server.exe' + else: + context.server_path = '../../../build/bin/llama-server' + if 'LLAMA_SERVER_BIN_PATH' in os.environ: + context.server_path = os.environ['LLAMA_SERVER_BIN_PATH'] + server_listen_addr = context.server_fqdn + server_args = [ + '--host', server_listen_addr, + '--port', context.server_port, + ] + if context.model_file: + server_args.extend(['--model', context.model_file]) + if context.model_url: + server_args.extend(['--model-url', context.model_url]) + if context.model_hf_repo: + server_args.extend(['--hf-repo', context.model_hf_repo]) + if context.model_hf_file: + server_args.extend(['--hf-file', context.model_hf_file]) + if context.n_batch: + server_args.extend(['--batch-size', context.n_batch]) + if context.n_ubatch: + server_args.extend(['--ubatch-size', context.n_ubatch]) + if context.n_threads: + server_args.extend(['--threads', context.threads]) + if context.n_gpu_layer: + server_args.extend(['--n-gpu-layers', context.n_gpu_layer]) + if context.draft is not None: + server_args.extend(['--draft', context.draft]) + if context.server_continuous_batching: + server_args.append('--cont-batching') + if context.server_embeddings: + server_args.append('--embedding') + if context.server_metrics: + server_args.append('--metrics') + if context.model_alias: + server_args.extend(['--alias', context.model_alias]) + if context.n_ctx: + server_args.extend(['--ctx-size', context.n_ctx]) + if context.n_slots: + server_args.extend(['--parallel', context.n_slots]) + if context.n_server_predict: + server_args.extend(['--n-predict', context.n_server_predict]) + if context.slot_save_path: + server_args.extend(['--slot-save-path', context.slot_save_path]) + if context.server_api_key: + server_args.extend(['--api-key', context.server_api_key]) + if context.n_ga: + server_args.extend(['--grp-attn-n', context.n_ga]) + if context.n_ga_w: + server_args.extend(['--grp-attn-w', context.n_ga_w]) + if context.debug: + server_args.append('--verbose') + if 'SERVER_LOG_FORMAT_JSON' not in os.environ: + server_args.extend(['--log-format', "text"]) + + args = [str(arg) for arg in [context.server_path, *server_args]] + print(f"bench: starting server with: {' '.join(args)}") + + flags = 0 + if 'nt' == os.name: + flags |= subprocess.DETACHED_PROCESS + flags |= subprocess.CREATE_NEW_PROCESS_GROUP + flags |= subprocess.CREATE_NO_WINDOW + + pkwargs = { + 'creationflags': flags, + 'stdout': subprocess.PIPE, + 'stderr': subprocess.PIPE + } + context.server_process = subprocess.Popen( + [str(arg) for arg in [context.server_path, *server_args]], + **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] + + def server_log(in_stream, out_stream): + for line in iter(in_stream.readline, b''): + print(line.decode('utf-8'), end='', file=out_stream) + + thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout)) + thread_stdout.start() + + thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr)) + thread_stderr.start() + + print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}") diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature new file mode 100644 index 000000000..cf14b3b44 --- /dev/null +++ b/examples/server/tests/features/wrong_usages.feature @@ -0,0 +1,22 @@ +# run with: ./tests.sh --no-skipped --tags wrong_usage +@wrong_usage +Feature: Wrong usage of llama.cpp server + + #3969 The user must always set 
--n-predict option + # to cap the number of tokens any completion request can generate + # or pass n_predict/max_tokens in the request. + Scenario: Infinite loop + Given a server listening on localhost:8080 + And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + # Uncomment below to fix the issue + #And 64 server max tokens to predict + Then the server is starting + Given a prompt: + """ + Go to: infinite loop + """ + # Uncomment below to fix the issue + #And 128 max tokens to predict + Given concurrent completion requests + Then the server is idle + Then all prompts are predicted diff --git a/examples/server/tests/pytest.ini b/examples/server/tests/pytest.ini deleted file mode 100644 index 6df308df7..000000000 --- a/examples/server/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -[pytest] -markers = - slow: marks tests as slow (deselect with '-m "not slow"') - serial diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 15d024914..2c741ea10 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -1,8 +1,6 @@ aiohttp~=3.9.3 -pytest~=8.3.3 -huggingface_hub~=0.23.2 +behave~=1.2.6 +huggingface_hub~=0.20.3 numpy~=1.26.4 -openai~=1.55.3 +openai~=1.30.3 prometheus-client~=0.20.0 -requests~=2.32.3 -wget~=3.2 diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh index 33fa8cc64..72a0fbad8 100755 --- a/examples/server/tests/tests.sh +++ b/examples/server/tests/tests.sh @@ -1,23 +1,11 @@ #!/bin/bash -# make sure we are in the right directory -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -cd $SCRIPT_DIR - set -eu -if [[ "${SLOW_TESTS:-0}" == 1 ]]; then - # Slow tests for tool calls need quite a few models ahead of time to avoid timing out. 
- python $SCRIPT_DIR/../../../scripts/fetch_server_test_models.py -fi - if [ $# -lt 1 ] then - if [[ "${SLOW_TESTS:-0}" == 1 ]]; then - pytest -v -x - else - pytest -v -x -m "not slow" - fi + # Start @llama.cpp scenario + behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp else - pytest "$@" + behave "$@" fi diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py deleted file mode 100644 index 1485de8ce..000000000 --- a/examples/server/tests/unit/test_basic.py +++ /dev/null @@ -1,96 +0,0 @@ -import pytest -import requests -from utils import * - -server = ServerPreset.tinyllama2() - - -@pytest.fixture(scope="module", autouse=True) -def create_server(): - global server - server = ServerPreset.tinyllama2() - - -def test_server_start_simple(): - global server - server.start() - res = server.make_request("GET", "/health") - assert res.status_code == 200 - - -def test_server_props(): - global server - server.start() - res = server.make_request("GET", "/props") - assert res.status_code == 200 - assert ".gguf" in res.body["model_path"] - assert res.body["total_slots"] == server.n_slots - default_val = res.body["default_generation_settings"] - assert server.n_ctx is not None and server.n_slots is not None - assert default_val["n_ctx"] == server.n_ctx / server.n_slots - assert default_val["params"]["seed"] == server.seed - - -def test_server_models(): - global server - server.start() - res = server.make_request("GET", "/models") - assert res.status_code == 200 - assert len(res.body["data"]) == 1 - assert res.body["data"][0]["id"] == server.model_alias - - -def test_server_slots(): - global server - - # without slots endpoint enabled, this should return error - server.server_slots = False - server.start() - res = server.make_request("GET", "/slots") - assert res.status_code == 501 # ERROR_TYPE_NOT_SUPPORTED - assert "error" in res.body - server.stop() - - # with slots endpoint enabled, this should return slots info - server.server_slots = True - server.n_slots = 2 - server.start() - res = server.make_request("GET", "/slots") - assert res.status_code == 200 - assert len(res.body) == server.n_slots - assert server.n_ctx is not None and server.n_slots is not None - assert res.body[0]["n_ctx"] == server.n_ctx / server.n_slots - assert "params" in res.body[0] - assert res.body[0]["params"]["seed"] == server.seed - - -def test_load_split_model(): - global server - server.model_hf_repo = "ggml-org/models" - server.model_hf_file = "tinyllamas/split/stories15M-q8_0-00001-of-00003.gguf" - server.model_alias = "tinyllama-split" - server.start() - res = server.make_request("POST", "/completion", data={ - "n_predict": 16, - "prompt": "Hello", - "temperature": 0.0, - }) - assert res.status_code == 200 - assert match_regex("(little|girl)+", res.body["content"]) - - -def test_no_webui(): - global server - # default: webui enabled - server.start() - url = f"http://{server.server_host}:{server.server_port}" - res = requests.get(url) - assert res.status_code == 200 - assert "" in res.text - server.stop() - - # with --no-webui - server.no_webui = True - server.start() - res = requests.get(url) - assert res.status_code == 404 diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py deleted file mode 100644 index f23d5cff4..000000000 --- a/examples/server/tests/unit/test_chat_completion.py +++ /dev/null @@ -1,267 +0,0 @@ -import pytest -from openai import OpenAI -from utils import 
* - -server: ServerProcess - -@pytest.fixture(autouse=True) -def create_server(): - global server - server = ServerPreset.tinyllama2() - - -@pytest.mark.parametrize( - "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template", - [ - (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", False, None), - (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", True, None), - (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", False, None), - (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None), - (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, 'chatml'), - (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None), - ] -) -def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template): - global server - server.jinja = jinja - server.chat_template = chat_template - server.start() - res = server.make_request("POST", "/chat/completions", data={ - "model": model, - "max_tokens": max_tokens, - "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, - ], - }) - assert res.status_code == 200 - assert "cmpl" in res.body["id"] # make sure the completion id has the expected format - assert res.body["system_fingerprint"].startswith("b") - assert res.body["model"] == model if model is not None else server.model_alias - assert res.body["usage"]["prompt_tokens"] == n_prompt - assert res.body["usage"]["completion_tokens"] == n_predicted - choice = res.body["choices"][0] - assert "assistant" == choice["message"]["role"] - assert match_regex(re_content, choice["message"]["content"]) - assert choice["finish_reason"] == finish_reason - - -@pytest.mark.parametrize( - "system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", - [ - ("Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), - ("You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), - ] -) -def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): - global server - server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL - server.start() - res = server.make_stream_request("POST", "/chat/completions", data={ - "max_tokens": max_tokens, - "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, - ], - "stream": True, - }) - content = "" - last_cmpl_id = None - for data in res: - choice = data["choices"][0] - assert data["system_fingerprint"].startswith("b") - assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future - if last_cmpl_id is None: - last_cmpl_id = data["id"] - assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream - if choice["finish_reason"] in ["stop", "length"]: - assert data["usage"]["prompt_tokens"] == n_prompt - assert 
data["usage"]["completion_tokens"] == n_predicted - assert "content" not in choice["delta"] - assert match_regex(re_content, content) - assert choice["finish_reason"] == finish_reason - else: - assert choice["finish_reason"] is None - content += choice["delta"]["content"] - - -def test_chat_completion_with_openai_library(): - global server - server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") - res = client.chat.completions.create( - model="gpt-3.5-turbo-instruct", - messages=[ - {"role": "system", "content": "Book"}, - {"role": "user", "content": "What is the best book"}, - ], - max_tokens=8, - seed=42, - temperature=0.8, - ) - assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b") - assert res.choices[0].finish_reason == "length" - assert res.choices[0].message.content is not None - assert match_regex("(Suddenly)+", res.choices[0].message.content) - - -def test_chat_template(): - global server - server.chat_template = "llama3" - server.debug = True # to get the "__verbose" object in the response - server.start() - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": 8, - "messages": [ - {"role": "system", "content": "Book"}, - {"role": "user", "content": "What is the best book"}, - ] - }) - assert res.status_code == 200 - assert "__verbose" in res.body - assert res.body["__verbose"]["prompt"] == " <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - - -def test_apply_chat_template(): - global server - server.chat_template = "command-r" - server.start() - res = server.make_request("POST", "/apply-template", data={ - "messages": [ - {"role": "system", "content": "You are a test."}, - {"role": "user", "content":"Hi there"}, - ] - }) - assert res.status_code == 200 - assert "prompt" in res.body - assert res.body["prompt"] == "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a test.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" - - -@pytest.mark.parametrize("response_format,n_predicted,re_content", [ - ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""), - ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"), - ({"type": "json_object"}, 10, "(\\{|John)+"), - ({"type": "sound"}, 0, None), - # invalid response format (expected to fail) - ({"type": "json_object", "schema": 123}, 0, None), - ({"type": "json_object", "schema": {"type": 123}}, 0, None), - ({"type": "json_object", "schema": {"type": "hiccup"}}, 0, None), -]) -def test_completion_with_response_format(response_format: dict, n_predicted: int, re_content: str | None): - global server - server.start() - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": n_predicted, - "messages": [ - {"role": "system", "content": "You are a coding assistant."}, - {"role": "user", "content": "Write an example"}, - ], - "response_format": response_format, - }) - if re_content is not None: - assert res.status_code == 200 - choice = res.body["choices"][0] - assert match_regex(re_content, choice["message"]["content"]) - else: - assert res.status_code != 200 - assert "error" in res.body - - -@pytest.mark.parametrize("messages", [ - None, - "string", - [123], - [{}], - [{"role": 123}], - [{"role": "system", "content": 123}], - # [{"content": "hello"}], # TODO: 
should not be a valid case - [{"role": "system", "content": "test"}, {}], -]) -def test_invalid_chat_completion_req(messages): - global server - server.start() - res = server.make_request("POST", "/chat/completions", data={ - "messages": messages, - }) - assert res.status_code == 400 or res.status_code == 500 - assert "error" in res.body - - -def test_chat_completion_with_timings_per_token(): - global server - server.start() - res = server.make_stream_request("POST", "/chat/completions", data={ - "max_tokens": 10, - "messages": [{"role": "user", "content": "test"}], - "stream": True, - "timings_per_token": True, - }) - for data in res: - assert "timings" in data - assert "prompt_per_second" in data["timings"] - assert "predicted_per_second" in data["timings"] - assert "predicted_n" in data["timings"] - assert data["timings"]["predicted_n"] <= 10 - - -def test_logprobs(): - global server - server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") - res = client.chat.completions.create( - model="gpt-3.5-turbo-instruct", - temperature=0.0, - messages=[ - {"role": "system", "content": "Book"}, - {"role": "user", "content": "What is the best book"}, - ], - max_tokens=5, - logprobs=True, - top_logprobs=10, - ) - output_text = res.choices[0].message.content - aggregated_text = '' - assert res.choices[0].logprobs is not None - assert res.choices[0].logprobs.content is not None - for token in res.choices[0].logprobs.content: - aggregated_text += token.token - assert token.logprob <= 0.0 - assert token.bytes is not None - assert len(token.top_logprobs) > 0 - assert aggregated_text == output_text - - -def test_logprobs_stream(): - global server - server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") - res = client.chat.completions.create( - model="gpt-3.5-turbo-instruct", - temperature=0.0, - messages=[ - {"role": "system", "content": "Book"}, - {"role": "user", "content": "What is the best book"}, - ], - max_tokens=5, - logprobs=True, - top_logprobs=10, - stream=True, - ) - output_text = '' - aggregated_text = '' - for data in res: - choice = data.choices[0] - if choice.finish_reason is None: - if choice.delta.content: - output_text += choice.delta.content - assert choice.logprobs is not None - assert choice.logprobs.content is not None - for token in choice.logprobs.content: - aggregated_text += token.token - assert token.logprob <= 0.0 - assert token.bytes is not None - assert token.top_logprobs is not None - assert len(token.top_logprobs) > 0 - assert aggregated_text == output_text diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py deleted file mode 100644 index 0ed5b99be..000000000 --- a/examples/server/tests/unit/test_completion.py +++ /dev/null @@ -1,428 +0,0 @@ -import pytest -import requests -import time -from openai import OpenAI -from utils import * - -server = ServerPreset.tinyllama2() - - -@pytest.fixture(scope="module", autouse=True) -def create_server(): - global server - server = ServerPreset.tinyllama2() - -@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [ - ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False), - ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True), -]) -def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: 
int, n_predicted: int, truncated: bool, return_tokens: bool): - global server - server.start() - res = server.make_request("POST", "/completion", data={ - "n_predict": n_predict, - "prompt": prompt, - "return_tokens": return_tokens, - }) - assert res.status_code == 200 - assert res.body["timings"]["prompt_n"] == n_prompt - assert res.body["timings"]["predicted_n"] == n_predicted - assert res.body["truncated"] == truncated - assert type(res.body["has_new_line"]) == bool - assert match_regex(re_content, res.body["content"]) - if return_tokens: - assert len(res.body["tokens"]) > 0 - assert all(type(tok) == int for tok in res.body["tokens"]) - else: - assert res.body["tokens"] == [] - - -@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [ - ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False), - ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False), -]) -def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool): - global server - server.start() - res = server.make_stream_request("POST", "/completion", data={ - "n_predict": n_predict, - "prompt": prompt, - "stream": True, - }) - content = "" - for data in res: - assert "stop" in data and type(data["stop"]) == bool - if data["stop"]: - assert data["timings"]["prompt_n"] == n_prompt - assert data["timings"]["predicted_n"] == n_predicted - assert data["truncated"] == truncated - assert data["stop_type"] == "limit" - assert type(data["has_new_line"]) == bool - assert "generation_settings" in data - assert server.n_predict is not None - assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict) - assert data["generation_settings"]["seed"] == server.seed - assert match_regex(re_content, content) - else: - assert len(data["tokens"]) > 0 - assert all(type(tok) == int for tok in data["tokens"]) - content += data["content"] - - -def test_completion_stream_vs_non_stream(): - global server - server.start() - res_stream = server.make_stream_request("POST", "/completion", data={ - "n_predict": 8, - "prompt": "I believe the meaning of life is", - "stream": True, - }) - res_non_stream = server.make_request("POST", "/completion", data={ - "n_predict": 8, - "prompt": "I believe the meaning of life is", - }) - content_stream = "" - for data in res_stream: - content_stream += data["content"] - assert content_stream == res_non_stream.body["content"] - - -def test_completion_with_openai_library(): - global server - server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") - res = client.completions.create( - model="davinci-002", - prompt="I believe the meaning of life is", - max_tokens=8, - ) - assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b") - assert res.choices[0].finish_reason == "length" - assert res.choices[0].text is not None - assert match_regex("(going|bed)+", res.choices[0].text) - - -def test_completion_stream_with_openai_library(): - global server - server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") - res = client.completions.create( - model="davinci-002", - prompt="I believe the meaning of life is", - max_tokens=8, - stream=True, - ) - output_text = '' - for data in res: - choice = data.choices[0] - if choice.finish_reason is None: - assert choice.text is not None - 
output_text += choice.text - assert match_regex("(going|bed)+", output_text) - - -@pytest.mark.parametrize("n_slots", [1, 2]) -def test_consistent_result_same_seed(n_slots: int): - global server - server.n_slots = n_slots - server.start() - last_res = None - for _ in range(4): - res = server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "seed": 42, - "temperature": 0.0, - "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed - }) - if last_res is not None: - assert res.body["content"] == last_res.body["content"] - last_res = res - - -@pytest.mark.parametrize("n_slots", [1, 2]) -def test_different_result_different_seed(n_slots: int): - global server - server.n_slots = n_slots - server.start() - last_res = None - for seed in range(4): - res = server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "seed": seed, - "temperature": 1.0, - "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed - }) - if last_res is not None: - assert res.body["content"] != last_res.body["content"] - last_res = res - -# TODO figure why it don't work with temperature = 1 -# @pytest.mark.parametrize("temperature", [0.0, 1.0]) -@pytest.mark.parametrize("n_batch", [16, 32]) -@pytest.mark.parametrize("temperature", [0.0]) -def test_consistent_result_different_batch_size(n_batch: int, temperature: float): - global server - server.n_batch = n_batch - server.start() - last_res = None - for _ in range(4): - res = server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "seed": 42, - "temperature": temperature, - "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed - }) - if last_res is not None: - assert res.body["content"] == last_res.body["content"] - last_res = res - - -@pytest.mark.skip(reason="This test fails on linux, need to be fixed") -def test_cache_vs_nocache_prompt(): - global server - server.start() - res_cache = server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "seed": 42, - "temperature": 1.0, - "cache_prompt": True, - }) - res_no_cache = server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "seed": 42, - "temperature": 1.0, - "cache_prompt": False, - }) - assert res_cache.body["content"] == res_no_cache.body["content"] - - -def test_completion_with_tokens_input(): - global server - server.temperature = 0.0 - server.start() - prompt_str = "I believe the meaning of life is" - res = server.make_request("POST", "/tokenize", data={ - "content": prompt_str, - "add_special": True, - }) - assert res.status_code == 200 - tokens = res.body["tokens"] - - # single completion - res = server.make_request("POST", "/completion", data={ - "prompt": tokens, - }) - assert res.status_code == 200 - assert type(res.body["content"]) == str - - # batch completion - res = server.make_request("POST", "/completion", data={ - "prompt": [tokens, tokens], - }) - assert res.status_code == 200 - assert type(res.body) == list - assert len(res.body) == 2 - assert res.body[0]["content"] == res.body[1]["content"] - - # mixed string and tokens - res = server.make_request("POST", "/completion", data={ - "prompt": [tokens, prompt_str], - }) - assert res.status_code == 200 - assert type(res.body) == list - assert len(res.body) == 2 - assert res.body[0]["content"] == res.body[1]["content"] - - # mixed string and tokens in one 
sequence - res = server.make_request("POST", "/completion", data={ - "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str], - }) - assert res.status_code == 200 - assert type(res.body["content"]) == str - - -@pytest.mark.parametrize("n_slots,n_requests", [ - (1, 3), - (2, 2), - (2, 4), - (4, 2), # some slots must be idle - (4, 6), -]) -def test_completion_parallel_slots(n_slots: int, n_requests: int): - global server - server.n_slots = n_slots - server.temperature = 0.0 - server.start() - - PROMPTS = [ - ("Write a very long book.", "(very|special|big)+"), - ("Write another a poem.", "(small|house)+"), - ("What is LLM?", "(Dad|said)+"), - ("The sky is blue and I love it.", "(climb|leaf)+"), - ("Write another very long music lyrics.", "(friends|step|sky)+"), - ("Write a very long joke.", "(cat|Whiskers)+"), - ] - def check_slots_status(): - should_all_slots_busy = n_requests >= n_slots - time.sleep(0.1) - res = server.make_request("GET", "/slots") - n_busy = sum([1 for slot in res.body if slot["is_processing"]]) - if should_all_slots_busy: - assert n_busy == n_slots - else: - assert n_busy <= n_slots - - tasks = [] - for i in range(n_requests): - prompt, re_content = PROMPTS[i % len(PROMPTS)] - tasks.append((server.make_request, ("POST", "/completion", { - "prompt": prompt, - "seed": 42, - "temperature": 1.0, - }))) - tasks.append((check_slots_status, ())) - results = parallel_function_calls(tasks) - - # check results - for i in range(n_requests): - prompt, re_content = PROMPTS[i % len(PROMPTS)] - res = results[i] - assert res.status_code == 200 - assert type(res.body["content"]) == str - assert len(res.body["content"]) > 10 - # FIXME: the result is not deterministic when using other slot than slot 0 - # assert match_regex(re_content, res.body["content"]) - - -@pytest.mark.parametrize( - "prompt,n_predict,response_fields", - [ - ("I believe the meaning of life is", 8, []), - ("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]), - ], -) -def test_completion_response_fields( - prompt: str, n_predict: int, response_fields: list[str] -): - global server - server.start() - res = server.make_request( - "POST", - "/completion", - data={ - "n_predict": n_predict, - "prompt": prompt, - "response_fields": response_fields, - }, - ) - assert res.status_code == 200 - assert "content" in res.body - assert len(res.body["content"]) - if len(response_fields): - assert res.body["generation_settings/n_predict"] == n_predict - assert res.body["prompt"] == " " + prompt - assert isinstance(res.body["content"], str) - assert len(res.body) == len(response_fields) - else: - assert len(res.body) - assert "generation_settings" in res.body - - -def test_n_probs(): - global server - server.start() - res = server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "n_probs": 10, - "temperature": 0.0, - "n_predict": 5, - }) - assert res.status_code == 200 - assert "completion_probabilities" in res.body - assert len(res.body["completion_probabilities"]) == 5 - for tok in res.body["completion_probabilities"]: - assert "id" in tok and tok["id"] > 0 - assert "token" in tok and type(tok["token"]) == str - assert "logprob" in tok and tok["logprob"] <= 0.0 - assert "bytes" in tok and type(tok["bytes"]) == list - assert len(tok["top_logprobs"]) == 10 - for prob in tok["top_logprobs"]: - assert "id" in prob and prob["id"] > 0 - assert "token" in prob and type(prob["token"]) == str - assert "logprob" in prob and prob["logprob"] <= 0.0 - 
assert "bytes" in prob and type(prob["bytes"]) == list - - -def test_n_probs_stream(): - global server - server.start() - res = server.make_stream_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "n_probs": 10, - "temperature": 0.0, - "n_predict": 5, - "stream": True, - }) - for data in res: - if data["stop"] == False: - assert "completion_probabilities" in data - assert len(data["completion_probabilities"]) == 1 - for tok in data["completion_probabilities"]: - assert "id" in tok and tok["id"] > 0 - assert "token" in tok and type(tok["token"]) == str - assert "logprob" in tok and tok["logprob"] <= 0.0 - assert "bytes" in tok and type(tok["bytes"]) == list - assert len(tok["top_logprobs"]) == 10 - for prob in tok["top_logprobs"]: - assert "id" in prob and prob["id"] > 0 - assert "token" in prob and type(prob["token"]) == str - assert "logprob" in prob and prob["logprob"] <= 0.0 - assert "bytes" in prob and type(prob["bytes"]) == list - - -def test_n_probs_post_sampling(): - global server - server.start() - res = server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "n_probs": 10, - "temperature": 0.0, - "n_predict": 5, - "post_sampling_probs": True, - }) - assert res.status_code == 200 - assert "completion_probabilities" in res.body - assert len(res.body["completion_probabilities"]) == 5 - for tok in res.body["completion_probabilities"]: - assert "id" in tok and tok["id"] > 0 - assert "token" in tok and type(tok["token"]) == str - assert "prob" in tok and 0.0 < tok["prob"] <= 1.0 - assert "bytes" in tok and type(tok["bytes"]) == list - assert len(tok["top_probs"]) == 10 - for prob in tok["top_probs"]: - assert "id" in prob and prob["id"] > 0 - assert "token" in prob and type(prob["token"]) == str - assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0 - assert "bytes" in prob and type(prob["bytes"]) == list - # because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs - assert any(prob["prob"] == 1.0 for prob in tok["top_probs"]) - - -def test_cancel_request(): - global server - server.n_ctx = 4096 - server.n_predict = -1 - server.n_slots = 1 - server.server_slots = True - server.start() - # send a request that will take a long time, but cancel it before it finishes - try: - server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - }, timeout=0.1) - except requests.exceptions.ReadTimeout: - pass # expected - # make sure the slot is free - time.sleep(1) # wait for HTTP_POLLING_SECONDS - res = server.make_request("GET", "/slots") - assert res.body[0]["is_processing"] == False diff --git a/examples/server/tests/unit/test_ctx_shift.py b/examples/server/tests/unit/test_ctx_shift.py deleted file mode 100644 index be93a6d31..000000000 --- a/examples/server/tests/unit/test_ctx_shift.py +++ /dev/null @@ -1,67 +0,0 @@ -import pytest -from utils import * - -server = ServerPreset.tinyllama2() - - -LONG_TEXT = """ -Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. -Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. -Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
-""".strip() - -@pytest.fixture(scope="module", autouse=True) -def create_server(): - global server - server = ServerPreset.tinyllama2() - server.n_ctx = 256 - server.n_slots = 2 - - -def test_ctx_shift_enabled(): - # the prompt is 301 tokens - # the slot context is 256/2 = 128 tokens - # the prompt is truncated to keep the last 109 tokens - # 64 tokens are generated thanks to shifting the context when it gets full - global server - server.start() - res = server.make_request("POST", "/completion", data={ - "n_predict": 64, - "prompt": LONG_TEXT, - }) - assert res.status_code == 200 - assert res.body["timings"]["prompt_n"] == 109 - assert res.body["timings"]["predicted_n"] == 64 - assert res.body["truncated"] is True - - -@pytest.mark.parametrize("n_predict,n_token_output,truncated", [ - (64, 64, False), - (-1, 120, True), -]) -def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool): - global server - server.disable_ctx_shift = True - server.n_predict = -1 - server.start() - res = server.make_request("POST", "/completion", data={ - "n_predict": n_predict, - "prompt": "Hi how are you", - }) - assert res.status_code == 200 - assert res.body["timings"]["predicted_n"] == n_token_output - assert res.body["truncated"] == truncated - - -def test_ctx_shift_disabled_long_prompt(): - global server - server.disable_ctx_shift = True - server.start() - res = server.make_request("POST", "/completion", data={ - "n_predict": 64, - "prompt": LONG_TEXT, - }) - assert res.status_code != 200 - assert "error" in res.body - assert "exceeds the available context size" in res.body["error"]["message"] diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py deleted file mode 100644 index 8b0eb42b0..000000000 --- a/examples/server/tests/unit/test_embedding.py +++ /dev/null @@ -1,237 +0,0 @@ -import base64 -import struct -import pytest -from openai import OpenAI -from utils import * - -server = ServerPreset.bert_bge_small() - -EPSILON = 1e-3 - -@pytest.fixture(scope="module", autouse=True) -def create_server(): - global server - server = ServerPreset.bert_bge_small() - - -def test_embedding_single(): - global server - server.pooling = 'last' - server.start() - res = server.make_request("POST", "/v1/embeddings", data={ - "input": "I believe the meaning of life is", - }) - assert res.status_code == 200 - assert len(res.body['data']) == 1 - assert 'embedding' in res.body['data'][0] - assert len(res.body['data'][0]['embedding']) > 1 - - # make sure embedding vector is normalized - assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON - - -def test_embedding_multiple(): - global server - server.pooling = 'last' - server.start() - res = server.make_request("POST", "/v1/embeddings", data={ - "input": [ - "I believe the meaning of life is", - "Write a joke about AI from a very long prompt which will not be truncated", - "This is a test", - "This is another test", - ], - }) - assert res.status_code == 200 - assert len(res.body['data']) == 4 - for d in res.body['data']: - assert 'embedding' in d - assert len(d['embedding']) > 1 - - -@pytest.mark.parametrize( - "input,is_multi_prompt", - [ - # do not crash on empty input - ("", False), - # single prompt - ("string", False), - ([12, 34, 56], False), - ([12, 34, "string", 56, 78], False), - # multiple prompts - (["string1", "string2"], True), - (["string1", [12, 34, 56]], True), - ([[12, 34, 56], [12, 34, 56]], True), - ([[12, 34, 56], [12, "string", 34, 56]], True), 
- ] -) -def test_embedding_mixed_input(input, is_multi_prompt: bool): - global server - server.start() - res = server.make_request("POST", "/v1/embeddings", data={"input": input}) - assert res.status_code == 200 - data = res.body['data'] - if is_multi_prompt: - assert len(data) == len(input) - for d in data: - assert 'embedding' in d - assert len(d['embedding']) > 1 - else: - assert 'embedding' in data[0] - assert len(data[0]['embedding']) > 1 - - -def test_embedding_pooling_none(): - global server - server.pooling = 'none' - server.start() - res = server.make_request("POST", "/embeddings", data={ - "input": "hello hello hello", - }) - assert res.status_code == 200 - assert 'embedding' in res.body[0] - assert len(res.body[0]['embedding']) == 5 # 3 text tokens + 2 special - - # make sure embedding vector is not normalized - for x in res.body[0]['embedding']: - assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON - - -def test_embedding_pooling_none_oai(): - global server - server.pooling = 'none' - server.start() - res = server.make_request("POST", "/v1/embeddings", data={ - "input": "hello hello hello", - }) - - # /v1/embeddings does not support pooling type 'none' - assert res.status_code == 400 - assert "error" in res.body - - -def test_embedding_openai_library_single(): - global server - server.pooling = 'last' - server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") - res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is") - assert len(res.data) == 1 - assert len(res.data[0].embedding) > 1 - - -def test_embedding_openai_library_multiple(): - global server - server.pooling = 'last' - server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") - res = client.embeddings.create(model="text-embedding-3-small", input=[ - "I believe the meaning of life is", - "Write a joke about AI from a very long prompt which will not be truncated", - "This is a test", - "This is another test", - ]) - assert len(res.data) == 4 - for d in res.data: - assert len(d.embedding) > 1 - - -def test_embedding_error_prompt_too_long(): - global server - server.pooling = 'last' - server.start() - res = server.make_request("POST", "/v1/embeddings", data={ - "input": "This is a test " * 512, - }) - assert res.status_code != 200 - assert "too large" in res.body["error"]["message"] - - -def test_same_prompt_give_same_result(): - server.pooling = 'last' - server.start() - res = server.make_request("POST", "/v1/embeddings", data={ - "input": [ - "I believe the meaning of life is", - "I believe the meaning of life is", - "I believe the meaning of life is", - "I believe the meaning of life is", - "I believe the meaning of life is", - ], - }) - assert res.status_code == 200 - assert len(res.body['data']) == 5 - for i in range(1, len(res.body['data'])): - v0 = res.body['data'][0]['embedding'] - vi = res.body['data'][i]['embedding'] - for x, y in zip(v0, vi): - assert abs(x - y) < EPSILON - - -@pytest.mark.parametrize( - "content,n_tokens", - [ - ("I believe the meaning of life is", 9), - ("This is a test", 6), - ] -) -def test_embedding_usage_single(content, n_tokens): - global server - server.start() - res = server.make_request("POST", "/v1/embeddings", data={"input": content}) - assert res.status_code == 200 - assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] - assert res.body['usage']['prompt_tokens'] == n_tokens - - -def 
test_embedding_usage_multiple(): - global server - server.start() - res = server.make_request("POST", "/v1/embeddings", data={ - "input": [ - "I believe the meaning of life is", - "I believe the meaning of life is", - ], - }) - assert res.status_code == 200 - assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] - assert res.body['usage']['prompt_tokens'] == 2 * 9 - - -def test_embedding_openai_library_base64(): - server.start() - test_input = "Test base64 embedding output" - - # get embedding in default format - res = server.make_request("POST", "/v1/embeddings", data={ - "input": test_input - }) - assert res.status_code == 200 - vec0 = res.body["data"][0]["embedding"] - - # get embedding in base64 format - res = server.make_request("POST", "/v1/embeddings", data={ - "input": test_input, - "encoding_format": "base64" - }) - - assert res.status_code == 200 - assert "data" in res.body - assert len(res.body["data"]) == 1 - - embedding_data = res.body["data"][0] - assert "embedding" in embedding_data - assert isinstance(embedding_data["embedding"], str) - - # Verify embedding is valid base64 - decoded = base64.b64decode(embedding_data["embedding"]) - # Verify decoded data can be converted back to float array - float_count = len(decoded) // 4 # 4 bytes per float - floats = struct.unpack(f'{float_count}f', decoded) - assert len(floats) > 0 - assert all(isinstance(x, float) for x in floats) - assert len(floats) == len(vec0) - - # make sure the decoded data is the same as the original - for x, y in zip(floats, vec0): - assert abs(x - y) < EPSILON diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py deleted file mode 100644 index 10554db0f..000000000 --- a/examples/server/tests/unit/test_infill.py +++ /dev/null @@ -1,77 +0,0 @@ -import pytest -from utils import * - -server = ServerPreset.tinyllama_infill() - -@pytest.fixture(scope="module", autouse=True) -def create_server(): - global server - server = ServerPreset.tinyllama_infill() - - -def test_infill_without_input_extra(): - global server - server.start() - res = server.make_request("POST", "/infill", data={ - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", - "prompt": " int n_threads = llama_", - "input_suffix": "}\n", - }) - assert res.status_code == 200 - assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"]) - - -def test_infill_with_input_extra(): - global server - server.start() - res = server.make_request("POST", "/infill", data={ - "input_extra": [{ - "filename": "llama.h", - "text": "LLAMA_API int32_t llama_n_threads();\n" - }], - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", - "prompt": " int n_threads = llama_", - "input_suffix": "}\n", - }) - assert res.status_code == 200 - assert match_regex("(Dad|excited|park)+", res.body["content"]) - - -@pytest.mark.parametrize("input_extra", [ - {}, - {"filename": "ok"}, - {"filename": 123}, - {"filename": 123, "text": "abc"}, - {"filename": 123, "text": 456}, -]) -def test_invalid_input_extra_req(input_extra): - global server - server.start() - res = server.make_request("POST", "/infill", data={ - "input_extra": [input_extra], - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", - "prompt": " int n_threads = llama_", - "input_suffix": "}\n", - }) - assert res.status_code == 400 - assert "error" in res.body - - -@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test") -def test_with_qwen_model(): - global server - 
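    # Slow path: pulls a real Qwen2.5-Coder GGUF via --hf-repo/--hf-file (hence the 600 s start timeout)
    # and asserts the exact infill completion text, so it only runs when SLOW_TESTS is enabled (see is_slow_test_allowed in utils.py).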
server.model_file = None - server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF" - server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf" - server.start(timeout_seconds=600) - res = server.make_request("POST", "/infill", data={ - "input_extra": [{ - "filename": "llama.h", - "text": "LLAMA_API int32_t llama_n_threads();\n" - }], - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", - "prompt": " int n_threads = llama_", - "input_suffix": "}\n", - }) - assert res.status_code == 200 - assert res.body["content"] == "n_threads();\n printf(\"Number of threads: %d\\n\", n_threads);\n return 0;\n" diff --git a/examples/server/tests/unit/test_lora.py b/examples/server/tests/unit/test_lora.py deleted file mode 100644 index c1aa8be70..000000000 --- a/examples/server/tests/unit/test_lora.py +++ /dev/null @@ -1,115 +0,0 @@ -import pytest -from utils import * - -server = ServerPreset.stories15m_moe() - -LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf" - -@pytest.fixture(scope="module", autouse=True) -def create_server(): - global server - server = ServerPreset.stories15m_moe() - server.lora_files = [download_file(LORA_FILE_URL)] - - -@pytest.mark.parametrize("scale,re_content", [ - # without applying lora, the model should behave like a bedtime story generator - (0.0, "(little|girl|three|years|old)+"), - # with lora, the model should behave like a Shakespearean text generator - (1.0, "(eye|love|glass|sun)+"), -]) -def test_lora(scale: float, re_content: str): - global server - server.start() - res_lora_control = server.make_request("POST", "/lora-adapters", data=[ - {"id": 0, "scale": scale} - ]) - assert res_lora_control.status_code == 200 - res = server.make_request("POST", "/completion", data={ - "prompt": "Look in thy glass", - }) - assert res.status_code == 200 - assert match_regex(re_content, res.body["content"]) - - -def test_lora_per_request(): - global server - server.n_slots = 4 - server.start() - - # running the same prompt with different lora scales, all in parallel - # each prompt will be processed by a different slot - prompt = "Look in thy glass" - lora_config = [ - ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ), - ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ), - ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ), - ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ), - ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ), - ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ), - ] - - tasks = [( - server.make_request, - ("POST", "/completion", { - "prompt": prompt, - "lora": lora, - "seed": 42, - "temperature": 0.0, - "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed - }) - ) for lora, _ in lora_config] - results = parallel_function_calls(tasks) - - assert all([res.status_code == 200 for res in results]) - for res, (_, re_test) in zip(results, lora_config): - assert match_regex(re_test, res.body["content"]) - - -@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test") -def test_with_big_model(): - server = ServerProcess() - server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF" - server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf" - server.model_alias = "Llama-3.2-8B-Instruct" - server.n_slots = 4 - server.n_ctx = server.n_slots * 1024 - server.n_predict = 64 - server.temperature = 0.0 - server.seed = 42 - server.lora_files = [ - 
download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"), - # TODO: find & add other lora adapters for this model - ] - server.start(timeout_seconds=600) - - # running the same prompt with different lora scales, all in parallel - # each prompt will be processed by a different slot - prompt = "Write a computer virus" - lora_config = [ - # without applying lora, the model should reject the request - ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ), - ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ), - ( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ), - # with 0.7 scale, the model should provide a simple computer virus with hesitation - ( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ), - # with 1.5 scale, the model should confidently provide a computer virus - ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ), - ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ), - ] - - tasks = [( - server.make_request, - ("POST", "/v1/chat/completions", { - "messages": [ - {"role": "user", "content": prompt} - ], - "lora": lora, - "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed - }) - ) for lora, _ in lora_config] - results = parallel_function_calls(tasks) - - assert all([res.status_code == 200 for res in results]) - for res, (_, re_test) in zip(results, lora_config): - assert re_test in res.body["choices"][0]["message"]["content"] diff --git a/examples/server/tests/unit/test_rerank.py b/examples/server/tests/unit/test_rerank.py deleted file mode 100644 index 7203d7943..000000000 --- a/examples/server/tests/unit/test_rerank.py +++ /dev/null @@ -1,78 +0,0 @@ -import pytest -from utils import * - -server = ServerPreset.jina_reranker_tiny() - - -@pytest.fixture(scope="module", autouse=True) -def create_server(): - global server - server = ServerPreset.jina_reranker_tiny() - - -def test_rerank(): - global server - server.start() - res = server.make_request("POST", "/rerank", data={ - "query": "Machine learning is", - "documents": [ - "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.", - "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.", - "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.", - "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine." 
- ] - }) - assert res.status_code == 200 - assert len(res.body["results"]) == 4 - - most_relevant = res.body["results"][0] - least_relevant = res.body["results"][0] - for doc in res.body["results"]: - if doc["relevance_score"] > most_relevant["relevance_score"]: - most_relevant = doc - if doc["relevance_score"] < least_relevant["relevance_score"]: - least_relevant = doc - - assert most_relevant["relevance_score"] > least_relevant["relevance_score"] - assert most_relevant["index"] == 2 - assert least_relevant["index"] == 3 - - -@pytest.mark.parametrize("documents", [ - [], - None, - 123, - [1, 2, 3], -]) -def test_invalid_rerank_req(documents): - global server - server.start() - res = server.make_request("POST", "/rerank", data={ - "query": "Machine learning is", - "documents": documents, - }) - assert res.status_code == 400 - assert "error" in res.body - - -@pytest.mark.parametrize( - "query,doc1,doc2,n_tokens", - [ - ("Machine learning is", "A machine", "Learning is", 19), - ("Which city?", "Machine learning is ", "Paris, capitale de la", 26), - ] -) -def test_rerank_usage(query, doc1, doc2, n_tokens): - global server - server.start() - - res = server.make_request("POST", "/rerank", data={ - "query": query, - "documents": [ - doc1, - doc2, - ] - }) - assert res.status_code == 200 - assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] - assert res.body['usage']['prompt_tokens'] == n_tokens diff --git a/examples/server/tests/unit/test_security.py b/examples/server/tests/unit/test_security.py deleted file mode 100644 index 620b25376..000000000 --- a/examples/server/tests/unit/test_security.py +++ /dev/null @@ -1,83 +0,0 @@ -import pytest -from openai import OpenAI -from utils import * - -server = ServerPreset.tinyllama2() - -TEST_API_KEY = "sk-this-is-the-secret-key" - -@pytest.fixture(scope="module", autouse=True) -def create_server(): - global server - server = ServerPreset.tinyllama2() - server.api_key = TEST_API_KEY - - -@pytest.mark.parametrize("endpoint", ["/health", "/models"]) -def test_access_public_endpoint(endpoint: str): - global server - server.start() - res = server.make_request("GET", endpoint) - assert res.status_code == 200 - assert "error" not in res.body - - -@pytest.mark.parametrize("api_key", [None, "invalid-key"]) -def test_incorrect_api_key(api_key: str): - global server - server.start() - res = server.make_request("POST", "/completions", data={ - "prompt": "I believe the meaning of life is", - }, headers={ - "Authorization": f"Bearer {api_key}" if api_key else None, - }) - assert res.status_code == 401 - assert "error" in res.body - assert res.body["error"]["type"] == "authentication_error" - - -def test_correct_api_key(): - global server - server.start() - res = server.make_request("POST", "/completions", data={ - "prompt": "I believe the meaning of life is", - }, headers={ - "Authorization": f"Bearer {TEST_API_KEY}", - }) - assert res.status_code == 200 - assert "error" not in res.body - assert "content" in res.body - - -def test_openai_library_correct_api_key(): - global server - server.start() - client = OpenAI(api_key=TEST_API_KEY, base_url=f"http://{server.server_host}:{server.server_port}") - res = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "system", "content": "You are a chatbot."}, - {"role": "user", "content": "What is the meaning of life?"}, - ], - ) - assert len(res.choices) == 1 - - -@pytest.mark.parametrize("origin,cors_header,cors_header_value", [ - ("localhost", 
"Access-Control-Allow-Origin", "localhost"), - ("web.mydomain.fr", "Access-Control-Allow-Origin", "web.mydomain.fr"), - ("origin", "Access-Control-Allow-Credentials", "true"), - ("web.mydomain.fr", "Access-Control-Allow-Methods", "GET, POST"), - ("web.mydomain.fr", "Access-Control-Allow-Headers", "*"), -]) -def test_cors_options(origin: str, cors_header: str, cors_header_value: str): - global server - server.start() - res = server.make_request("OPTIONS", "/completions", headers={ - "Origin": origin, - "Access-Control-Request-Method": "POST", - "Access-Control-Request-Headers": "Authorization", - }) - assert res.status_code == 200 - assert cors_header in res.headers - assert res.headers[cors_header] == cors_header_value diff --git a/examples/server/tests/unit/test_slot_save.py b/examples/server/tests/unit/test_slot_save.py deleted file mode 100644 index 38704f5ec..000000000 --- a/examples/server/tests/unit/test_slot_save.py +++ /dev/null @@ -1,98 +0,0 @@ -import pytest -from utils import * - -server = ServerPreset.tinyllama2() - -@pytest.fixture(scope="module", autouse=True) -def create_server(): - global server - server = ServerPreset.tinyllama2() - server.slot_save_path = "./tmp" - server.temperature = 0.0 - - -def test_slot_save_restore(): - global server - server.start() - - # First prompt in slot 1 should be fully processed - res = server.make_request("POST", "/completion", data={ - "prompt": "What is the capital of France?", - "id_slot": 1, - "cache_prompt": True, - }) - assert res.status_code == 200 - assert match_regex("(Whiskers|Flana)+", res.body["content"]) - assert res.body["timings"]["prompt_n"] == 21 # all tokens are processed - - # Save state of slot 1 - res = server.make_request("POST", "/slots/1?action=save", data={ - "filename": "slot1.bin", - }) - assert res.status_code == 200 - assert res.body["n_saved"] == 84 - - # Since we have cache, this should only process the last tokens - res = server.make_request("POST", "/completion", data={ - "prompt": "What is the capital of Germany?", - "id_slot": 1, - "cache_prompt": True, - }) - assert res.status_code == 200 - assert match_regex("(Jack|said)+", res.body["content"]) - assert res.body["timings"]["prompt_n"] == 6 # only different part is processed - - # Loading the saved cache into slot 0 - res = server.make_request("POST", "/slots/0?action=restore", data={ - "filename": "slot1.bin", - }) - assert res.status_code == 200 - assert res.body["n_restored"] == 84 - - # Since we have cache, slot 0 should only process the last tokens - res = server.make_request("POST", "/completion", data={ - "prompt": "What is the capital of Germany?", - "id_slot": 0, - "cache_prompt": True, - }) - assert res.status_code == 200 - assert match_regex("(Jack|said)+", res.body["content"]) - assert res.body["timings"]["prompt_n"] == 6 # only different part is processed - - # For verification that slot 1 was not corrupted during slot 0 load, same thing should work - res = server.make_request("POST", "/completion", data={ - "prompt": "What is the capital of Germany?", - "id_slot": 1, - "cache_prompt": True, - }) - assert res.status_code == 200 - assert match_regex("(Jack|said)+", res.body["content"]) - assert res.body["timings"]["prompt_n"] == 1 - - -def test_slot_erase(): - global server - server.start() - - res = server.make_request("POST", "/completion", data={ - "prompt": "What is the capital of France?", - "id_slot": 1, - "cache_prompt": True, - }) - assert res.status_code == 200 - assert match_regex("(Whiskers|Flana)+", res.body["content"]) - assert 
res.body["timings"]["prompt_n"] == 21 # all tokens are processed - - # erase slot 1 - res = server.make_request("POST", "/slots/1?action=erase") - assert res.status_code == 200 - - # re-run the same prompt, it should process all tokens again - res = server.make_request("POST", "/completion", data={ - "prompt": "What is the capital of France?", - "id_slot": 1, - "cache_prompt": True, - }) - assert res.status_code == 200 - assert match_regex("(Whiskers|Flana)+", res.body["content"]) - assert res.body["timings"]["prompt_n"] == 21 # all tokens are processed diff --git a/examples/server/tests/unit/test_speculative.py b/examples/server/tests/unit/test_speculative.py deleted file mode 100644 index 54db38cf3..000000000 --- a/examples/server/tests/unit/test_speculative.py +++ /dev/null @@ -1,126 +0,0 @@ -import pytest -from utils import * - -# We use a F16 MOE gguf as main model, and q4_0 as draft model - -server = ServerPreset.stories15m_moe() - -MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf" - -def create_server(): - global server - server = ServerPreset.stories15m_moe() - # set default values - server.model_draft = download_file(MODEL_DRAFT_FILE_URL) - server.draft_min = 4 - server.draft_max = 8 - - -@pytest.fixture(scope="module", autouse=True) -def fixture_create_server(): - return create_server() - - -def test_with_and_without_draft(): - global server - server.model_draft = None # disable draft model - server.start() - res = server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "temperature": 0.0, - "top_k": 1, - }) - assert res.status_code == 200 - content_no_draft = res.body["content"] - server.stop() - - # create new server with draft model - create_server() - server.start() - res = server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "temperature": 0.0, - "top_k": 1, - }) - assert res.status_code == 200 - content_draft = res.body["content"] - - assert content_no_draft == content_draft - - -def test_different_draft_min_draft_max(): - global server - test_values = [ - (1, 2), - (1, 4), - (4, 8), - (4, 12), - (8, 16), - ] - last_content = None - for draft_min, draft_max in test_values: - server.stop() - server.draft_min = draft_min - server.draft_max = draft_max - server.start() - res = server.make_request("POST", "/completion", data={ - "prompt": "I believe the meaning of life is", - "temperature": 0.0, - "top_k": 1, - }) - assert res.status_code == 200 - if last_content is not None: - assert last_content == res.body["content"] - last_content = res.body["content"] - - -def test_slot_ctx_not_exceeded(): - global server - server.n_ctx = 64 - server.start() - res = server.make_request("POST", "/completion", data={ - "prompt": "Hello " * 56, - "temperature": 0.0, - "top_k": 1, - "speculative.p_min": 0.0, - }) - assert res.status_code == 200 - assert len(res.body["content"]) > 0 - - -def test_with_ctx_shift(): - global server - server.n_ctx = 64 - server.start() - res = server.make_request("POST", "/completion", data={ - "prompt": "Hello " * 56, - "temperature": 0.0, - "top_k": 1, - "n_predict": 64, - "speculative.p_min": 0.0, - }) - assert res.status_code == 200 - assert len(res.body["content"]) > 0 - assert res.body["tokens_predicted"] == 64 - assert res.body["truncated"] == True - - -@pytest.mark.parametrize("n_slots,n_requests", [ - (1, 2), - (2, 2), -]) -def test_multi_requests_parallel(n_slots: int, n_requests: int): - global server - 
server.n_slots = n_slots - server.start() - tasks = [] - for _ in range(n_requests): - tasks.append((server.make_request, ("POST", "/completion", { - "prompt": "I believe the meaning of life is", - "temperature": 0.0, - "top_k": 1, - }))) - results = parallel_function_calls(tasks) - for res in results: - assert res.status_code == 200 - assert match_regex("(wise|kind|owl|answer)+", res.body["content"]) diff --git a/examples/server/tests/unit/test_tokenize.py b/examples/server/tests/unit/test_tokenize.py deleted file mode 100644 index 382457c9d..000000000 --- a/examples/server/tests/unit/test_tokenize.py +++ /dev/null @@ -1,59 +0,0 @@ -import pytest -from utils import * - -server = ServerPreset.tinyllama2() - - -@pytest.fixture(scope="module", autouse=True) -def create_server(): - global server - server = ServerPreset.tinyllama2() - - -def test_tokenize_detokenize(): - global server - server.start() - # tokenize - content = "What is the capital of France ?" - res_tok = server.make_request("POST", "/tokenize", data={ - "content": content - }) - assert res_tok.status_code == 200 - assert len(res_tok.body["tokens"]) > 5 - # detokenize - res_detok = server.make_request("POST", "/detokenize", data={ - "tokens": res_tok.body["tokens"], - }) - assert res_detok.status_code == 200 - assert res_detok.body["content"].strip() == content - - -def test_tokenize_with_bos(): - global server - server.start() - # tokenize - content = "What is the capital of France ?" - bosId = 1 - res_tok = server.make_request("POST", "/tokenize", data={ - "content": content, - "add_special": True, - }) - assert res_tok.status_code == 200 - assert res_tok.body["tokens"][0] == bosId - - -def test_tokenize_with_pieces(): - global server - server.start() - # tokenize - content = "This is a test string with unicode 媽 and emoji 🤗" - res_tok = server.make_request("POST", "/tokenize", data={ - "content": content, - "with_pieces": True, - }) - assert res_tok.status_code == 200 - for token in res_tok.body["tokens"]: - assert "id" in token - assert token["id"] > 0 - assert "piece" in token - assert len(token["piece"]) > 0 diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py deleted file mode 100644 index 4a551404f..000000000 --- a/examples/server/tests/unit/test_tool_call.py +++ /dev/null @@ -1,418 +0,0 @@ -import pytest -from utils import * - -server: ServerProcess - -TIMEOUT_SERVER_START = 15*60 -TIMEOUT_HTTP_REQUEST = 60 - -@pytest.fixture(autouse=True) -def create_server(): - global server - server = ServerPreset.tinyllama2() - server.model_alias = "tinyllama-2-tool-call" - server.server_port = 8081 - - -TEST_TOOL = { - "type":"function", - "function": { - "name": "test", - "description": "", - "parameters": { - "type": "object", - "properties": { - "success": {"type": "boolean", "const": True}, - }, - "required": ["success"] - } - } -} - -PYTHON_TOOL = { - "type": "function", - "function": { - "name": "python", - "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", - "parameters": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "The code to run in the ipython interpreter." 
- } - }, - "required": ["code"] - } - } -} - -WEATHER_TOOL = { - "type":"function", - "function":{ - "name":"get_current_weather", - "description":"Get the current weather in a given location", - "parameters":{ - "type":"object", - "properties":{ - "location":{ - "type":"string", - "description":"The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'" - } - }, - "required":["location"] - } - } -} - - -def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None): - global server - n_predict = 512 - # server = ServerPreset.stories15m_moe() - server.jinja = True - server.n_predict = n_predict - server.chat_template_file = f'../../../models/templates/{template_name}.jinja' - server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": n_predict, - "messages": [ - {"role": "system", "content": "You are a coding assistant."}, - {"role": "user", "content": "Write an example"}, - ], - "tool_choice": "required", - "tools": [tool], - "parallel_tool_calls": False, - "temperature": 0.0, - "top_k": 1, - "top_p": 1.0, - }) - assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" - choice = res.body["choices"][0] - tool_calls = choice["message"].get("tool_calls") - assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' - tool_call = tool_calls[0] - expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"] - assert expected_function_name == tool_call["function"]["name"] - actual_arguments = tool_call["function"]["arguments"] - assert isinstance(actual_arguments, str) - if argument_key is not None: - actual_arguments = json.loads(actual_arguments) - assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}" - - -@pytest.mark.parametrize("template_name,tool,argument_key", [ - ("google-gemma-2-2b-it", TEST_TOOL, "success"), - ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"), - ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"), -]) -def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None): - do_test_completion_with_required_tool_tiny(template_name, tool, argument_key) - - -@pytest.mark.slow -@pytest.mark.parametrize("template_name,tool,argument_key", [ - ("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"), - ("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"), - ("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"), - ("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"), - ("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"), - ("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"), - ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"), - ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"), - ("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"), - ("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"), - ("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"), - ("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"), - ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"), - ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"), - ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"), - ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"), - ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"), 
- ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"), -]) -def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None): - do_test_completion_with_required_tool_tiny(template_name, tool, argument_key) - - -@pytest.mark.slow -@pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [ - (TEST_TOOL, "success", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), - - # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. - (TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), - (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), - - (TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - - (TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), - - (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), - - (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), - (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), - (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), - - (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), - (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), - (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), - - (TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)), - (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)), - (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", "chatml"), - - (TEST_TOOL, "success", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), - (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), - (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), - - (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), - (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), - (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), - # TODO: fix these - # (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - # (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), -]) -def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | 
Tuple[str, str | None] | None): - global server - n_predict = 512 - server.n_slots = 1 - server.jinja = True - server.n_ctx = 8192 - server.n_predict = n_predict - server.model_hf_repo = hf_repo - server.model_hf_file = None - if isinstance(template_override, tuple): - (template_hf_repo, template_variant) = template_override - server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" - assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." - elif isinstance(template_override, str): - server.chat_template = template_override - server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": n_predict, - "messages": [ - {"role": "system", "content": "You are a coding assistant."}, - {"role": "user", "content": "Write an example"}, - ], - "tool_choice": "required", - "tools": [tool], - "parallel_tool_calls": False, - "temperature": 0.0, - "top_k": 1, - "top_p": 1.0, - }, timeout=TIMEOUT_HTTP_REQUEST) - assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" - choice = res.body["choices"][0] - tool_calls = choice["message"].get("tool_calls") - assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' - tool_call = tool_calls[0] - expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"] - assert expected_function_name == tool_call["function"]["name"] - actual_arguments = tool_call["function"]["arguments"] - assert isinstance(actual_arguments, str) - if argument_key is not None: - actual_arguments = json.loads(actual_arguments) - assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}" - - -def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None): - global server - server.jinja = True - server.n_predict = n_predict - server.chat_template_file = f'../../../models/templates/{template_name}.jinja' - server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": n_predict, - "messages": [ - {"role": "system", "content": "You are a coding assistant."}, - {"role": "user", "content": "say hello world with python"}, - ], - "tools": tools if tools else None, - "tool_choice": tool_choice, - "temperature": 0.0, - "top_k": 1, - "top_p": 1.0, - }, timeout=TIMEOUT_HTTP_REQUEST) - assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" - choice = res.body["choices"][0] - assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}' - - -@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [ - ("meta-llama-Llama-3.3-70B-Instruct", 128, [], None), - ("meta-llama-Llama-3.3-70B-Instruct", 128, [TEST_TOOL], None), - ("meta-llama-Llama-3.3-70B-Instruct", 128, [PYTHON_TOOL], 'none'), -]) -def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None): - do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice) - - -@pytest.mark.slow -@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [ - 
("meetkai-functionary-medium-v3.2", 256, [], None), - ("meetkai-functionary-medium-v3.2", 256, [TEST_TOOL], None), - ("meetkai-functionary-medium-v3.2", 256, [PYTHON_TOOL], 'none'), - ("meetkai-functionary-medium-v3.1", 256, [], None), - ("meetkai-functionary-medium-v3.1", 256, [TEST_TOOL], None), - ("meetkai-functionary-medium-v3.1", 256, [PYTHON_TOOL], 'none'), - ("meta-llama-Llama-3.2-3B-Instruct", 256, [], None), - ("meta-llama-Llama-3.2-3B-Instruct", 256, [TEST_TOOL], None), - ("meta-llama-Llama-3.2-3B-Instruct", 256, [PYTHON_TOOL], 'none'), -]) -def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None): - do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice) - - -@pytest.mark.slow -@pytest.mark.parametrize("hf_repo,template_override", [ - ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")), - ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), - - ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - - ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), - - ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), - - ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), - ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), - - ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), - ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), - - ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), - ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), - - ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), - ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), - - # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. - ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), - - # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), - # ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), -]) -def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None): - global server - n_predict = 512 - server.n_slots = 1 - server.jinja = True - server.n_ctx = 8192 - server.n_predict = n_predict - server.model_hf_repo = hf_repo - server.model_hf_file = None - if isinstance(template_override, tuple): - (template_hf_repo, template_variant) = template_override - server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" - assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." 
- elif isinstance(template_override, str): - server.chat_template = template_override - server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": n_predict, - "messages": [ - {"role": "user", "content": "What is the weather in Istanbul?"}, - ], - "tools": [WEATHER_TOOL], - }, timeout=TIMEOUT_HTTP_REQUEST) - assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" - choice = res.body["choices"][0] - tool_calls = choice["message"].get("tool_calls") - assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' - tool_call = tool_calls[0] - assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"] - actual_arguments = json.loads(tool_call["function"]["arguments"]) - assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}" - location = actual_arguments["location"] - assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}" - assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}' - - -@pytest.mark.slow -@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [ - (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - - (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), - (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), - - (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), - - ('{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), - (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), - - ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), - ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), - - (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), - - (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), - - (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), - (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), - - (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), - (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), - - # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. 
- (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), - - # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), -]) -def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): - global server - server.n_slots = 1 - server.jinja = True - server.n_ctx = 8192 - server.n_predict = 128 - server.model_hf_repo = hf_repo - server.model_hf_file = None - if isinstance(template_override, tuple): - (template_hf_repo, template_variant) = template_override - server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" - assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." - elif isinstance(template_override, str): - server.chat_template = template_override - server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": 256, - "messages": [ - {"role": "system", "content": "You are a coding assistant."}, - {"role": "user", "content": "say hello world with python"}, - ], - "tools": [PYTHON_TOOL], - # Note: without these greedy params, Functionary v3.2 writes `def hello_world():\n print("Hello, World!")\nhello_world()` which is correct but a pain to test. - "temperature": 0.0, - "top_k": 1, - "top_p": 1.0, - }, timeout=TIMEOUT_HTTP_REQUEST) - assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" - choice = res.body["choices"][0] - tool_calls = choice["message"].get("tool_calls") - assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' - tool_call = tool_calls[0] - assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"] - actual_arguments = tool_call["function"]["arguments"] - if expected_arguments_override is not None: - assert actual_arguments == expected_arguments_override - else: - actual_arguments = json.loads(actual_arguments) - assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}" - code = actual_arguments["code"] - assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}" - assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? 
[Ww]orld!?')\)''', code), f'Expected hello world, got {code}' diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py deleted file mode 100644 index ce0680662..000000000 --- a/examples/server/tests/utils.py +++ /dev/null @@ -1,415 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# type: ignore[reportUnusedImport] - -import subprocess -import os -import re -import json -import sys -import requests -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import ( - Any, - Callable, - ContextManager, - Iterable, - Iterator, - List, - Literal, - Tuple, - Set, -) -from re import RegexFlag -import wget - - -DEFAULT_HTTP_TIMEOUT = 12 if "LLAMA_SANITIZE" not in os.environ else 30 - - -class ServerResponse: - headers: dict - status_code: int - body: dict | Any - - -class ServerProcess: - # default options - debug: bool = False - server_port: int = 8080 - server_host: str = "127.0.0.1" - model_hf_repo: str = "ggml-org/models" - model_hf_file: str | None = "tinyllamas/stories260K.gguf" - model_alias: str = "tinyllama-2" - temperature: float = 0.8 - seed: int = 42 - - # custom options - model_alias: str | None = None - model_url: str | None = None - model_file: str | None = None - model_draft: str | None = None - n_threads: int | None = None - n_gpu_layer: int | None = None - n_batch: int | None = None - n_ubatch: int | None = None - n_ctx: int | None = None - n_ga: int | None = None - n_ga_w: int | None = None - n_predict: int | None = None - n_prompts: int | None = 0 - slot_save_path: str | None = None - id_slot: int | None = None - cache_prompt: bool | None = None - n_slots: int | None = None - server_continuous_batching: bool | None = False - server_embeddings: bool | None = False - server_reranking: bool | None = False - server_metrics: bool | None = False - server_slots: bool | None = False - pooling: str | None = None - draft: int | None = None - api_key: str | None = None - lora_files: List[str] | None = None - disable_ctx_shift: int | None = False - draft_min: int | None = None - draft_max: int | None = None - no_webui: bool | None = None - jinja: bool | None = None - chat_template: str | None = None - chat_template_file: str | None = None - - # session variables - process: subprocess.Popen | None = None - - def __init__(self): - if "N_GPU_LAYERS" in os.environ: - self.n_gpu_layer = int(os.environ["N_GPU_LAYERS"]) - if "DEBUG" in os.environ: - self.debug = True - if "PORT" in os.environ: - self.server_port = int(os.environ["PORT"]) - - def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: - if "LLAMA_SERVER_BIN_PATH" in os.environ: - server_path = os.environ["LLAMA_SERVER_BIN_PATH"] - elif os.name == "nt": - server_path = "../../../build/bin/Release/llama-server.exe" - else: - server_path = "../../../build/bin/llama-server" - server_args = [ - "--host", - self.server_host, - "--port", - self.server_port, - "--temp", - self.temperature, - "--seed", - self.seed, - ] - if self.model_file: - server_args.extend(["--model", self.model_file]) - if self.model_url: - server_args.extend(["--model-url", self.model_url]) - if self.model_draft: - server_args.extend(["--model-draft", self.model_draft]) - if self.model_hf_repo: - server_args.extend(["--hf-repo", self.model_hf_repo]) - if self.model_hf_file: - server_args.extend(["--hf-file", self.model_hf_file]) - if self.n_batch: - server_args.extend(["--batch-size", self.n_batch]) - if self.n_ubatch: - server_args.extend(["--ubatch-size", self.n_ubatch]) - if 
self.n_threads: - server_args.extend(["--threads", self.n_threads]) - if self.n_gpu_layer: - server_args.extend(["--n-gpu-layers", self.n_gpu_layer]) - if self.draft is not None: - server_args.extend(["--draft", self.draft]) - if self.server_continuous_batching: - server_args.append("--cont-batching") - if self.server_embeddings: - server_args.append("--embedding") - if self.server_reranking: - server_args.append("--reranking") - if self.server_metrics: - server_args.append("--metrics") - if self.server_slots: - server_args.append("--slots") - if self.pooling: - server_args.extend(["--pooling", self.pooling]) - if self.model_alias: - server_args.extend(["--alias", self.model_alias]) - if self.n_ctx: - server_args.extend(["--ctx-size", self.n_ctx]) - if self.n_slots: - server_args.extend(["--parallel", self.n_slots]) - if self.n_predict: - server_args.extend(["--n-predict", self.n_predict]) - if self.slot_save_path: - server_args.extend(["--slot-save-path", self.slot_save_path]) - if self.n_ga: - server_args.extend(["--grp-attn-n", self.n_ga]) - if self.n_ga_w: - server_args.extend(["--grp-attn-w", self.n_ga_w]) - if self.debug: - server_args.append("--verbose") - if self.lora_files: - for lora_file in self.lora_files: - server_args.extend(["--lora", lora_file]) - if self.disable_ctx_shift: - server_args.extend(["--no-context-shift"]) - if self.api_key: - server_args.extend(["--api-key", self.api_key]) - if self.draft_max: - server_args.extend(["--draft-max", self.draft_max]) - if self.draft_min: - server_args.extend(["--draft-min", self.draft_min]) - if self.no_webui: - server_args.append("--no-webui") - if self.jinja: - server_args.append("--jinja") - if self.chat_template: - server_args.extend(["--chat-template", self.chat_template]) - if self.chat_template_file: - server_args.extend(["--chat-template-file", self.chat_template_file]) - - args = [str(arg) for arg in [server_path, *server_args]] - print(f"bench: starting server with: {' '.join(args)}") - - flags = 0 - if "nt" == os.name: - flags |= subprocess.DETACHED_PROCESS - flags |= subprocess.CREATE_NEW_PROCESS_GROUP - flags |= subprocess.CREATE_NO_WINDOW - - self.process = subprocess.Popen( - [str(arg) for arg in [server_path, *server_args]], - creationflags=flags, - stdout=sys.stdout, - stderr=sys.stdout, - env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None, - ) - server_instances.add(self) - - print(f"server pid={self.process.pid}, pytest pid={os.getpid()}") - - # wait for server to start - start_time = time.time() - while time.time() - start_time < timeout_seconds: - try: - response = self.make_request("GET", "/health", headers={ - "Authorization": f"Bearer {self.api_key}" if self.api_key else None - }) - if response.status_code == 200: - self.ready = True - return # server is ready - except Exception as e: - pass - print(f"Waiting for server to start...") - time.sleep(0.5) - raise TimeoutError(f"Server did not start within {timeout_seconds} seconds") - - def stop(self) -> None: - if self in server_instances: - server_instances.remove(self) - if self.process: - print(f"Stopping server with pid={self.process.pid}") - self.process.kill() - self.process = None - - def make_request( - self, - method: str, - path: str, - data: dict | Any | None = None, - headers: dict | None = None, - timeout: float | None = None, - ) -> ServerResponse: - url = f"http://{self.server_host}:{self.server_port}{path}" - parse_body = False - if method == "GET": - response = requests.get(url, headers=headers, 
timeout=timeout) - parse_body = True - elif method == "POST": - response = requests.post(url, headers=headers, json=data, timeout=timeout) - parse_body = True - elif method == "OPTIONS": - response = requests.options(url, headers=headers, timeout=timeout) - else: - raise ValueError(f"Unimplemented method: {method}") - result = ServerResponse() - result.headers = dict(response.headers) - result.status_code = response.status_code - result.body = response.json() if parse_body else None - print("Response from server", json.dumps(result.body, indent=2)) - return result - - def make_stream_request( - self, - method: str, - path: str, - data: dict | None = None, - headers: dict | None = None, - ) -> Iterator[dict]: - url = f"http://{self.server_host}:{self.server_port}{path}" - if method == "POST": - response = requests.post(url, headers=headers, json=data, stream=True) - else: - raise ValueError(f"Unimplemented method: {method}") - for line_bytes in response.iter_lines(): - line = line_bytes.decode("utf-8") - if '[DONE]' in line: - break - elif line.startswith('data: '): - data = json.loads(line[6:]) - print("Partial response from server", json.dumps(data, indent=2)) - yield data - - -server_instances: Set[ServerProcess] = set() - - -class ServerPreset: - @staticmethod - def tinyllama2() -> ServerProcess: - server = ServerProcess() - server.model_hf_repo = "ggml-org/models" - server.model_hf_file = "tinyllamas/stories260K.gguf" - server.model_alias = "tinyllama-2" - server.n_ctx = 256 - server.n_batch = 32 - server.n_slots = 2 - server.n_predict = 64 - server.seed = 42 - return server - - @staticmethod - def bert_bge_small() -> ServerProcess: - server = ServerProcess() - server.model_hf_repo = "ggml-org/models" - server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf" - server.model_alias = "bert-bge-small" - server.n_ctx = 512 - server.n_batch = 128 - server.n_ubatch = 128 - server.n_slots = 2 - server.seed = 42 - server.server_embeddings = True - return server - - @staticmethod - def tinyllama_infill() -> ServerProcess: - server = ServerProcess() - server.model_hf_repo = "ggml-org/models" - server.model_hf_file = "tinyllamas/stories260K-infill.gguf" - server.model_alias = "tinyllama-infill" - server.n_ctx = 2048 - server.n_batch = 1024 - server.n_slots = 1 - server.n_predict = 64 - server.temperature = 0.0 - server.seed = 42 - return server - - @staticmethod - def stories15m_moe() -> ServerProcess: - server = ServerProcess() - server.model_hf_repo = "ggml-org/stories15M_MOE" - server.model_hf_file = "stories15M_MOE-F16.gguf" - server.model_alias = "stories15m-moe" - server.n_ctx = 2048 - server.n_batch = 1024 - server.n_slots = 1 - server.n_predict = 64 - server.temperature = 0.0 - server.seed = 42 - return server - - @staticmethod - def jina_reranker_tiny() -> ServerProcess: - server = ServerProcess() - server.model_hf_repo = "ggml-org/models" - server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf" - server.model_alias = "jina-reranker" - server.n_ctx = 512 - server.n_batch = 512 - server.n_slots = 1 - server.seed = 42 - server.server_reranking = True - return server - - -def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]: - """ - Run multiple functions in parallel and return results in the same order as calls. Equivalent to Promise.all in JS. 
- - Example usage: - - results = parallel_function_calls([ - (func1, (arg1, arg2)), - (func2, (arg3, arg4)), - ]) - """ - results = [None] * len(function_list) - exceptions = [] - - def worker(index, func, args): - try: - result = func(*args) - results[index] = result - except Exception as e: - exceptions.append((index, str(e))) - - with ThreadPoolExecutor() as executor: - futures = [] - for i, (func, args) in enumerate(function_list): - future = executor.submit(worker, i, func, args) - futures.append(future) - - # Wait for all futures to complete - for future in as_completed(futures): - pass - - # Check if there were any exceptions - if exceptions: - print("Exceptions occurred:") - for index, error in exceptions: - print(f"Function at index {index}: {error}") - - return results - - -def match_regex(regex: str, text: str) -> bool: - return ( - re.compile( - regex, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL - ).search(text) - is not None - ) - - -def download_file(url: str, output_file_path: str | None = None) -> str: - """ - Download a file from a URL to a local path. If the file already exists, it will not be downloaded again. - - output_file_path is the local path to save the downloaded file. If not provided, the file will be saved in the root directory. - - Returns the local path of the downloaded file. - """ - file_name = url.split('/').pop() - output_file = f'./tmp/{file_name}' if output_file_path is None else output_file_path - if not os.path.exists(output_file): - print(f"Downloading {url} to {output_file}") - wget.download(url, out=output_file) - print(f"Done downloading to {output_file}") - else: - print(f"File already exists at {output_file}") - return output_file - - -def is_slow_test_allowed(): - return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON" diff --git a/examples/server/themes/buttons-top/index.html b/examples/server/themes/buttons-top/index.html index 3fb88fcc8..8334bcde5 100644 --- a/examples/server/themes/buttons-top/index.html +++ b/examples/server/themes/buttons-top/index.html @@ -222,9 +222,11 @@ temperature: 0.7, repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled + penalize_nl: false, top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled + tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled frequency_penalty: 0.0, // 0.0 = disabled @@ -778,6 +780,7 @@ ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} + ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} @@ -785,6 +788,7 @@
More options
+ ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html index 73f36d4b2..8361c5774 100644 --- a/examples/server/themes/wild/index.html +++ b/examples/server/themes/wild/index.html @@ -225,9 +225,11 @@ temperature: 0.7, repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled + penalize_nl: false, top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled + tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled frequency_penalty: 0.0, // 0.0 = disabled @@ -781,6 +783,7 @@ ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} + ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} @@ -788,6 +791,7 @@
More options
+ ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 5f97df5fd..db6b3b74d 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -1,45 +1,57 @@ #pragma once -#include "common.h" -#include "log.h" #include "llama.h" -#include "common/base64.hpp" - -// increase max payload length to allow use of larger context size -#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 -#include "httplib.h" +#include "common.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" -#include "minja.hpp" -#include "chat.hpp" -#include "chat-template.hpp" -#include -#include #include #include -#include +#include +#include -#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo" +#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" using json = nlohmann::ordered_json; -#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) -#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) -#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) -#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 +enum error_type { + ERROR_TYPE_INVALID_REQUEST, + ERROR_TYPE_AUTHENTICATION, + ERROR_TYPE_SERVER, + ERROR_TYPE_NOT_FOUND, + ERROR_TYPE_PERMISSION, + ERROR_TYPE_UNAVAILABLE, // custom error + ERROR_TYPE_NOT_SUPPORTED, // custom error +}; -#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +extern bool server_verbose; +extern bool server_log_json; -#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#ifndef SERVER_VERBOSE +#define SERVER_VERBOSE 1 +#endif + +#if SERVER_VERBOSE != 1 +#define LOG_VERBOSE(MSG, ...) +#else +#define LOG_VERBOSE(MSG, ...) \ + do \ + { \ + if (server_verbose) \ + { \ + server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \ + } \ + } while (0) +#endif + +#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO( MSG, ...) 
server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + +static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra); template static T json_value(const json & body, const std::string & key, const T & default_value) { @@ -48,7 +60,9 @@ static T json_value(const json & body, const std::string & key, const T & defaul try { return body.at(key); } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) { - LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name()); + std::stringstream ss; + ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value."; + LOG_WARNING(ss.str().c_str(), body); return default_value; } } else { @@ -56,300 +70,55 @@ static T json_value(const json & body, const std::string & key, const T & defaul } } -const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); +static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) { + std::stringstream ss_tid; + ss_tid << std::this_thread::get_id(); + json log = json{ + {"tid", ss_tid.str()}, + {"timestamp", time(nullptr)}, + }; -// -// tokenizer and input processing utils -// + if (server_log_json) { + log.merge_patch({ + {"level", level}, + {"function", function}, + {"line", line}, + {"msg", message}, + }); -static bool json_is_array_of_numbers(const json & data) { - if (data.is_array()) { - for (const auto & e : data) { - if (!e.is_number_integer()) { - return false; - } + if (!extra.empty()) { + log.merge_patch(extra); } - return true; - } - return false; -} -// is array having BOTH numbers & strings? -static bool json_is_array_of_mixed_numbers_strings(const json & data) { - bool seen_string = false; - bool seen_number = false; - if (data.is_array()) { - for (const auto & e : data) { - seen_string |= e.is_string(); - seen_number |= e.is_number_integer(); - if (seen_number && seen_string) { - return true; - } - } - } - return false; -} - -// get value by path(key1 / key2) -static json json_get_nested_values(const std::vector & paths, const json & js) { - json result = json::object(); - - for (const std::string & path : paths) { - json current = js; - const auto keys = string_split(path, /*separator*/ '/'); - bool valid_path = true; - for (const std::string & k : keys) { - if (valid_path && current.is_object() && current.contains(k)) { - current = current[k]; - } else { - valid_path = false; - } - } - if (valid_path) { - result[path] = current; - } - } - return result; -} - -/** - * this handles 2 cases: - * - only string, example: "string" - * - mixed string and tokens, example: [12, 34, "string", 56, 78] - */ -static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) { - // If `add_bos` is true, we only add BOS, when json_prompt is a string, - // or the first element of the json_prompt array is a string. 
- llama_tokens prompt_tokens; - - if (json_prompt.is_array()) { - bool first = true; - for (const auto & p : json_prompt) { - if (p.is_string()) { - auto s = p.template get(); - - llama_tokens p; - if (first) { - p = common_tokenize(vocab, s, add_special, parse_special); - first = false; - } else { - p = common_tokenize(vocab, s, false, parse_special); - } - - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); - } else { - if (first) { - first = false; - } - - prompt_tokens.push_back(p.template get()); - } - } + printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); } else { - auto s = json_prompt.template get(); - prompt_tokens = common_tokenize(vocab, s, add_special, parse_special); - } + char buf[1024]; + snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - return prompt_tokens; -} - -/** - * break the input "prompt" object into multiple prompt if needed, then tokenize them - * this supports these cases: - * - "prompt": "string" - * - "prompt": [12, 34, 56] - * - "prompt": [12, 34, "string", 56, 78] - * and multiple prompts (multi-tasks): - * - "prompt": ["string1", "string2"] - * - "prompt": ["string1", [12, 34, 56]] - * - "prompt": [[12, 34, 56], [78, 90, 12]] - * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]] - */ -static std::vector tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) { - std::vector result; - if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) { - // string or mixed - result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special)); - } else if (json_is_array_of_numbers(json_prompt)) { - // array of tokens - result.push_back(json_prompt.get()); - } else if (json_prompt.is_array()) { - // array of prompts - result.reserve(json_prompt.size()); - for (const auto & p : json_prompt) { - if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) { - result.push_back(tokenize_mixed(vocab, p, add_special, parse_special)); - } else if (json_is_array_of_numbers(p)) { - // array of tokens - result.push_back(p.get()); - } else { - throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens"); - } + if (!extra.empty()) { + log.merge_patch(extra); } - } else { - throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts"); - } - if (result.empty()) { - throw std::runtime_error("\"prompt\" must not be empty"); - } - return result; -} - -// return the last index of character that can form a valid string -// if the last character is potentially cut in half, return the index before the cut -// if validate_utf8(text) == text.size(), then the whole text is valid utf8 -static size_t validate_utf8(const std::string& text) { - size_t len = text.size(); - if (len == 0) return 0; - - // Check the last few bytes to see if a multi-byte character is cut off - for (size_t i = 1; i <= 4 && i <= len; ++i) { - unsigned char c = text[len - i]; - // Check for start of a multi-byte sequence from the end - if ((c & 0xE0) == 0xC0) { - // 2-byte character start: 110xxxxx - // Needs at least 2 bytes - if (i < 2) return len - i; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character start: 1110xxxx - // Needs at least 3 bytes - if (i < 3) return len - i; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character start: 11110xxx - // Needs at least 4 bytes - if (i < 4) return len - i; + std::stringstream 
ss; + ss << buf << " |"; + for (const auto & el : log.items()) + { + const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); + ss << " " << el.key() << "=" << value; } - } - // If no cut-off multi-byte character is found, return full length - return len; + const std::string str = ss.str(); + printf("%.*s\n", (int)str.size(), str.data()); + } + fflush(stdout); } // -// template utils +// chat template utils // -// format rerank task: [BOS]query[EOS][SEP]doc[EOS] -static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) { - llama_tokens result; - - result.reserve(doc.size() + query.size() + 4); - result.push_back(llama_vocab_bos(vocab)); - result.insert(result.end(), query.begin(), query.end()); - result.push_back(llama_vocab_eos(vocab)); - result.push_back(llama_vocab_sep(vocab)); - result.insert(result.end(), doc.begin(), doc.end()); - result.push_back(llama_vocab_eos(vocab)); - - return result; -} - -// format infill task -static llama_tokens format_infill( - const llama_vocab * vocab, - const json & input_prefix, - const json & input_suffix, - const json & input_extra, - const int n_batch, - const int n_predict, - const int n_ctx, - const bool spm_infill, - const llama_tokens & tokens_prompt - ) { - // TODO: optimize this block by reducing memory allocations and movement - - // use FIM repo-level pattern: - // ref: https://arxiv.org/pdf/2409.12186 - // - // [FIM_REP]myproject - // [FIM_SEP]filename0 - // extra chunk 0 - // [FIM_SEP]filename1 - // extra chunk 1 - // ... - // [FIM_SEP]filename - // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt - // - llama_tokens extra_tokens; - extra_tokens.reserve(n_ctx); - - auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false); - auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false); - - if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) { - // TODO: make project name an input - static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false); - - extra_tokens.push_back(llama_vocab_fim_rep(vocab)); - extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); - } - for (const auto & chunk : input_extra) { - // { "text": string, "filename": string } - const std::string text = json_value(chunk, "text", std::string()); - const std::string filename = json_value(chunk, "filename", std::string("tmp")); - - if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) { - const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false); - - extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab)); - extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); - } else { - // chunk separator in binary form to avoid confusing the AI - static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; - static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false); - - extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); - } - - const auto chunk_tokens = common_tokenize(vocab, text, false, false); - extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); - } - - if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) { - // TODO: current filename - static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false); - - 
extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab)); - extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); - } - - // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?) - const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4)); - const int n_suffix_take = std::min(tokens_suffix.size(), std::max(0, (n_batch/4) - (2 + tokens_prompt.size()))); - - SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take)); - - // fill the rest of the context with extra chunks - const int n_extra_take = std::min(std::max(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size()); - - tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); - tokens_suffix.resize(n_suffix_take); - - tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab)); - tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); - tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab)); - - auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix; - auto embd_end = spm_infill ? tokens_prefix : tokens_suffix; - - if (llama_vocab_get_add_bos(vocab)) { - embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab)); - } - - SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size()); - - // put the extra context before the FIM prefix - embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end()); - - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - embd_inp.push_back(llama_vocab_fim_mid(vocab)); - - return embd_inp; -} - // Format given chat. 
If tmpl is empty, we take the template from model metadata -inline std::string format_chat(const common_chat_template & tmpl, const std::vector & messages) { - std::vector chat; +inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { + std::vector chat; for (size_t i = 0; i < messages.size(); ++i) { const auto & curr_msg = messages[i]; @@ -373,12 +142,11 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); } - chat.push_back({role, content, /* tool_calls= */ {}}); + chat.push_back({role, content}); } - const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false); - LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); - + auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); + LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); return formatted_chat; } @@ -467,13 +235,30 @@ static std::string random_string() { } static std::string gen_chatcmplid() { - return "chatcmpl-" + random_string(); + std::stringstream chatcmplid; + chatcmplid << "chatcmpl-" << random_string(); + + return chatcmplid.str(); } // // other common utils // +static size_t common_part(const std::vector & a, const std::vector & b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + + return i; +} + +static size_t common_part(const std::string & a, const std::string & b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + + return i; +} + static bool ends_with(const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } @@ -499,7 +284,7 @@ template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; ++begin) { - ret += common_token_to_piece(ctx, *begin); + ret += llama_token_to_piece(ctx, *begin); } return ret; @@ -507,7 +292,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { - std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token); + std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) @@ -521,86 +306,75 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx, return out; } -static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) { - const std::string str = - std::string(event) + ": " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). 
+struct completion_token_output { + llama_token tok; + std::string text_to_send; - LOG_DBG("data stream, to_send: %s", str.c_str()); + struct token_prob { + llama_token tok; + float prob; + }; - return sink.write(str.c_str(), str.size()); + std::vector probs; +}; + +// convert a vector of completion_token_output to json +static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { + json out = json::array(); + + for (const auto & prob : probs) { + json probs_for_token = json::array(); + + for (const auto & p : prob.probs) { + const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); + probs_for_token.push_back(json { + {"tok_str", tok_str}, + {"prob", p.prob}, + }); + } + + const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + out.push_back(json { + {"content", tok_str}, + {"probs", probs_for_token}, + }); + } + + return out; } // // OAI utils // -static json oaicompat_completion_params_parse(const json & body) { - json llama_params; - - if (!body.contains("prompt")) { - throw std::runtime_error("\"prompt\" is required"); - } - - // Handle "stop" field - if (body.contains("stop") && body.at("stop").is_string()) { - llama_params["stop"] = json::array({body.at("stop").get()}); - } else { - llama_params["stop"] = json_value(body, "stop", json::array()); - } - - // Handle "n" field - int n_choices = json_value(body, "n", 1); - if (n_choices != 1) { - throw std::runtime_error("Only one completion choice is allowed"); - } - - // Params supported by OAI but unsupported by llama.cpp - static const std::vector unsupported_params { "best_of", "echo", "suffix" }; - for (const auto & param : unsupported_params) { - if (body.contains(param)) { - throw std::runtime_error("Unsupported param: " + param); - } - } - - // Copy remaining properties to llama_params - for (const auto & item : body.items()) { - // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens" - if (!llama_params.contains(item.key()) || item.key() == "n_predict") { - llama_params[item.key()] = item.value(); - } - } - - return llama_params; -} - static json oaicompat_completion_params_parse( + const struct llama_model * model, const json & body, /* openai api json semantics */ - bool use_jinja, - const common_chat_templates & chat_templates) -{ + const std::string & chat_template) { json llama_params; - const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use - ? *chat_templates.template_tool_use - : *chat_templates.template_default; - auto tools = json_value(body, "tools", json()); - auto stream = json_value(body, "stream", false); + llama_params["__oaicompat"] = true; - if (tools.is_array() && !tools.empty()) { - if (stream) { - throw std::runtime_error("Cannot use tools with stream"); - } - if (!use_jinja) { - throw std::runtime_error("tools param requires --jinja flag"); - } - } - if (!use_jinja) { - if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) { - throw std::runtime_error("Unsupported param: tool_choice"); - } - } + // Map OpenAI parameters to llama.cpp parameters + // + // For parameters that are defined by the OpenAI documentation (e.g. 
+ // temperature), we explicitly specify OpenAI's intended default; we + // need to do that because sometimes OpenAI disagrees with llama.cpp + // + // https://platform.openai.com/docs/api-reference/chat/create + llama_sampling_params default_sparams; + llama_params["model"] = json_value(body, "model", std::string("unknown")); + llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); + llama_params["logit_bias"] = json_value(body, "logit_bias", json::object()); + llama_params["n_predict"] = json_value(body, "max_tokens", -1); + llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); + llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); + llama_params["stream"] = json_value(body, "stream", false); + llama_params["temperature"] = json_value(body, "temperature", 1.0); + llama_params["top_p"] = json_value(body, "top_p", 1.0); + + // Apply chat template to the list of messages + llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); // Handle "stop" field if (body.contains("stop") && body.at("stop").is_string()) { @@ -615,57 +389,11 @@ static json oaicompat_completion_params_parse( std::string response_type = json_value(response_format, "type", std::string()); if (response_type == "json_object") { llama_params["json_schema"] = json_value(response_format, "schema", json::object()); - } else if (response_type == "json_schema") { - json json_schema = json_value(response_format, "json_schema", json::object()); - llama_params["json_schema"] = json_value(json_schema, "schema", json::object()); } else if (!response_type.empty() && response_type != "text") { throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type); } } - // Apply chat template to the list of messages - if (use_jinja) { - auto tool_choice = json_value(body, "tool_choice", std::string("auto")); - if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") { - throw std::runtime_error("Invalid tool_choice: " + tool_choice); - } - if (tool_choice != "none" && llama_params.contains("grammar")) { - throw std::runtime_error("Cannot use custom grammar constraints with tools."); - } - common_chat_inputs inputs; - inputs.messages = body.at("messages"); - inputs.tools = tools; - inputs.tool_choice = tool_choice; - inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); - if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) { - LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n"); - inputs.parallel_tool_calls = false; - } - inputs.stream = stream; - // TODO: support mixing schema w/ tools beyond generic format. 
- inputs.json_schema = json_value(llama_params, "json_schema", json()); - auto chat_params = common_chat_params_init(tmpl, inputs); - - llama_params["chat_format"] = static_cast(chat_params.format); - llama_params["prompt"] = chat_params.prompt; - llama_params["grammar"] = chat_params.grammar; - llama_params["grammar_lazy"] = chat_params.grammar_lazy; - auto grammar_triggers = json::array(); - for (const auto & trigger : chat_params.grammar_triggers) { - grammar_triggers.push_back({ - {"word", trigger.word}, - {"at_start", trigger.at_start}, - }); - } - llama_params["grammar_triggers"] = grammar_triggers; - llama_params["preserved_tokens"] = chat_params.preserved_tokens; - for (const auto & stop : chat_params.additional_stops) { - llama_params["stop"].push_back(stop); - } - } else { - llama_params["prompt"] = format_chat(tmpl, body.at("messages")); - } - // Handle "n" field int n_choices = json_value(body, "n", 1); if (n_choices != 1) { @@ -674,14 +402,22 @@ static json oaicompat_completion_params_parse( // Handle "logprobs" field // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future - if (json_value(body, "logprobs", false)) { + if (body.contains("logprobs")) { llama_params["n_probs"] = json_value(body, "top_logprobs", 20); - } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) { + } else if (body.contains("top_logprobs")) { throw std::runtime_error("top_logprobs requires logprobs to be set to true"); } + // Params supported by OAI but unsupported by llama.cpp + static const std::vector unsupported_params { "tools", "tool_choice" }; + for (auto & param : unsupported_params) { + if (body.contains(param)) { + throw std::runtime_error("Unsupported param: " + param); + } + } + // Copy remaining properties to llama_params - // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint. + // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint. 
// See "launch_slot_with_task()" for a complete list of params supported by llama.cpp for (const auto & item : body.items()) { // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens" @@ -693,41 +429,171 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) { - json data = json::array(); - int32_t n_tokens = 0; - int i = 0; - for (const auto & elem : embeddings) { - json embedding_obj; +static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) { + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); - if (use_base64) { - const auto& vec = json_value(elem, "embedding", json::array()).get>(); - const char* data_ptr = reinterpret_cast(vec.data()); - size_t data_size = vec.size() * sizeof(float); - embedding_obj = { - {"embedding", base64::encode(data_ptr, data_size)}, - {"index", i++}, - {"object", "embedding"}, - {"encoding_format", "base64"} - }; + std::string finish_reason = "length"; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + + json choices = + streaming ? json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}) + : json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{{"content", content}, + {"role", "assistant"}}}}}); + + std::time_t t = std::time(0); + + json res = json { + {"choices", choices}, + {"created", t}, + {"model", + json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, + {"usage", json { + {"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens} + }}, + {"id", completion_id} + }; + + if (server_verbose) { + res["__verbose"] = result; + } + + if (result.contains("completion_probabilities")) { + res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); + } + + return res; +} + +// return value is vector as there is one case where we might need to generate two responses +static std::vector format_partial_response_oaicompat(json result, const std::string & completion_id) { + if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { + return std::vector({result}); + } + + bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; + std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + if (stopped_limit) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", completion_id}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", completion_id}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } } else { - embedding_obj = { - {"embedding", json_value(elem, "embedding", json::array())}, - {"index", i++}, - {"object", "embedding"} - }; - } - data.push_back(embedding_obj); + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. 
+ if (content.empty()) { + return std::vector({json::object()}); + } - n_tokens += json_value(elem, "tokens_evaluated", 0); + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json { + {"choices", choices}, + {"created", t}, + {"id", completion_id}, + {"model", modelname}, + {"object", "chat.completion.chunk"} + }; + if (!finish_reason.empty()) { + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + ret.push_back({"usage", json { + {"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens} + }}); + } + + return std::vector({ret}); +} + +static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { + json data = json::array(); + int i = 0; + for (auto & elem : embeddings) { + data.push_back(json{ + {"embedding", json_value(elem, "embedding", json::array())}, + {"index", i++}, + {"object", "embedding"} + }); } json res = json { {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"object", "list"}, {"usage", json { - {"prompt_tokens", n_tokens}, - {"total_tokens", n_tokens} + {"prompt_tokens", 0}, + {"total_tokens", 0} }}, {"data", data} }; @@ -735,66 +601,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso return res; } -static json format_response_rerank(const json & request, const json & ranks) { - json data = json::array(); - int32_t n_tokens = 0; - int i = 0; - for (const auto & rank : ranks) { - data.push_back(json{ - {"index", i++}, - {"relevance_score", json_value(rank, "score", 0.0)}, - }); - - n_tokens += json_value(rank, "tokens_evaluated", 0); - } - - json res = json { - {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", "list"}, - {"usage", json { - {"prompt_tokens", n_tokens}, - {"total_tokens", n_tokens} - }}, - {"results", data} - }; - - return res; -} - -static bool is_valid_utf8(const std::string & str) { - const unsigned char* bytes = reinterpret_cast(str.data()); - const unsigned char* end = bytes + str.length(); - - while (bytes < end) { - if (*bytes <= 0x7F) { - // 1-byte sequence (0xxxxxxx) - bytes++; - } else if ((*bytes & 0xE0) == 0xC0) { - // 2-byte sequence (110xxxxx 10xxxxxx) - if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80) - return false; - bytes += 2; - } else if ((*bytes & 0xF0) == 0xE0) { - // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx) - if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80) - return false; - bytes += 3; - } else if ((*bytes & 0xF8) == 0xF0) { - // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) - if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 || - (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80) - return false; - bytes += 4; - } else { - // Invalid UTF-8 lead byte - return false; - } - } - - return true; -} - -static json format_tokenizer_response(const json & tokens) { +static json format_tokenizer_response(const std::vector & tokens) { return json { {"tokens", tokens} }; @@ -806,92 +613,42 @@ static json format_detokenized_response(const std::string & content) { }; } -static json format_logit_bias(const std::vector & logit_bias) { - json data = json::array(); - for (const auto & lb : logit_bias) { - data.push_back(json{ - {"bias", lb.bias}, - {"token", lb.token}, - }); +static json 
format_error_response(const std::string & message, const enum error_type type) { + std::string type_str; + int code = 500; + switch (type) { + case ERROR_TYPE_INVALID_REQUEST: + type_str = "invalid_request_error"; + code = 400; + break; + case ERROR_TYPE_AUTHENTICATION: + type_str = "authentication_error"; + code = 401; + break; + case ERROR_TYPE_NOT_FOUND: + type_str = "not_found_error"; + code = 404; + break; + case ERROR_TYPE_SERVER: + type_str = "server_error"; + code = 500; + break; + case ERROR_TYPE_PERMISSION: + type_str = "permission_error"; + code = 403; + break; + case ERROR_TYPE_NOT_SUPPORTED: + type_str = "not_supported_error"; + code = 501; + break; + case ERROR_TYPE_UNAVAILABLE: + type_str = "unavailable_error"; + code = 503; + break; } - return data; -} - -static std::string safe_json_to_str(const json & data) { - return data.dump(-1, ' ', false, json::error_handler_t::replace); -} - -static std::vector get_token_probabilities(llama_context * ctx, int idx) { - std::vector cur; - const auto * logits = llama_get_logits_ith(ctx, idx); - - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - const int n_vocab = llama_vocab_n_tokens(vocab); - - cur.resize(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; - } - - // sort tokens by logits - std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) { - return a.logit > b.logit; - }); - - // apply softmax - float max_l = cur[0].logit; - float cum_sum = 0.0f; - for (size_t i = 0; i < cur.size(); ++i) { - float p = expf(cur[i].logit - max_l); - cur[i].p = p; - cum_sum += p; - } - for (size_t i = 0; i < cur.size(); ++i) { - cur[i].p /= cum_sum; - } - - return cur; -} - -static bool are_lora_equal( - const std::vector & l1, - const std::vector & l2) { - if (l1.size() != l2.size()) { - return false; - } - for (size_t i = 0; i < l1.size(); ++i) { - // we don't check lora.path to reduce the time complexity - if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) { - return false; - } - } - return true; -} - -// parse lora config from JSON request, returned a copy of lora_base with updated scale -static std::vector parse_lora_request( - const std::vector & lora_base, - const json & data) { - std::vector lora(lora_base); - int max_idx = lora.size(); - - // clear existing value - for (auto & entry : lora) { - entry.scale = 0.0f; - } - - // set value - for (const auto & entry : data) { - int id = json_value(entry, "id", -1); - float scale = json_value(entry, "scale", 0.0f); - if (0 <= id && id < max_idx) { - lora[id].scale = scale; - } else { - throw std::runtime_error("invalid adapter id"); - } - } - - return lora; + return json { + {"code", code}, + {"message", message}, + {"type", type_str}, + }; } diff --git a/examples/server/webui/.gitignore b/examples/server/webui/.gitignore deleted file mode 100644 index a547bf36d..000000000 --- a/examples/server/webui/.gitignore +++ /dev/null @@ -1,24 +0,0 @@ -# Logs -logs -*.log -npm-debug.log* -yarn-debug.log* -yarn-error.log* -pnpm-debug.log* -lerna-debug.log* - -node_modules -dist -dist-ssr -*.local - -# Editor directories and files -.vscode/* -!.vscode/extensions.json -.idea -.DS_Store -*.suo -*.ntvs* -*.njsproj -*.sln -*.sw? 
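Aside (illustration, not part of the patch): the tokenize_input_prompts comment removed in the examples/server/utils.hpp hunk above enumerates the "prompt" shapes the completion endpoints accept. Below is a minimal sketch of request bodies exercising each shape; the host, port and the /completion route are assumptions made for illustration.

```python
# Hedged sketch: every "prompt" shape listed in the tokenize_input_prompts
# comment removed above. Host/port and the /completion route are assumptions.
import requests

valid_prompts = [
    "string",                                    # plain string
    [12, 34, 56],                                # array of tokens
    [12, 34, "string", 56, 78],                  # mixed tokens and strings
    ["string1", "string2"],                      # multiple prompts (multi-task)
    ["string1", [12, 34, 56]],                   # string + token array
    [[12, 34, "string", 56, 78], [12, 34, 56]],  # list of mixed prompts
]

for prompt in valid_prompts:
    r = requests.post("http://127.0.0.1:8080/completion",
                      json={"prompt": prompt, "n_predict": 8})
    r.raise_for_status()
```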
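The removed server_sent_event helper and the re-added format_partial_response_oaicompat above together describe the streaming wire format: each chunk is sent as a "data: {...}" line terminated by a blank line, the deltas carry either {"role": "assistant"} or {"content": ...}, and the final chunk sets finish_reason ("stop" or "length"). The following is a hedged client-side sketch mirroring the make_stream_request test helper deleted earlier in this diff; the /v1/chat/completions route, host/port and model name are assumptions.

```python
# Hedged sketch of consuming the "chat.completion.chunk" stream described above.
# Mirrors the deleted make_stream_request test helper; URL and model are assumptions.
import json
import requests

url = "http://127.0.0.1:8080/v1/chat/completions"
payload = {
    "model": "gpt-3.5-turbo",  # placeholder model name
    "stream": True,
    "messages": [{"role": "user", "content": "Hello"}],
}

content = ""
with requests.post(url, json=payload, stream=True) as response:
    for line_bytes in response.iter_lines():
        line = line_bytes.decode("utf-8")
        if "[DONE]" in line:
            break
        if line.startswith("data: "):
            chunk = json.loads(line[len("data: "):])
            for choice in chunk.get("choices", []):
                # the first delta carries the role, later deltas carry content pieces
                content += choice.get("delta", {}).get("content", "") or ""
                if choice.get("finish_reason"):  # "stop" or "length" on the last chunk
                    print("finish_reason:", choice["finish_reason"])
print(content)
```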
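Also for reference, the error_type enum and format_error_response in the same utils.hpp hunk encode a fixed mapping between error type strings and HTTP status codes. Below is that mapping as a small table a test could assert against; the check_error_body helper is hypothetical and not part of the patch.

```python
# Status-code/type mapping taken from the switch in format_error_response above.
ERROR_TYPES = {
    "invalid_request_error": 400,  # ERROR_TYPE_INVALID_REQUEST
    "authentication_error":  401,  # ERROR_TYPE_AUTHENTICATION
    "permission_error":      403,  # ERROR_TYPE_PERMISSION
    "not_found_error":       404,  # ERROR_TYPE_NOT_FOUND
    "server_error":          500,  # ERROR_TYPE_SERVER
    "not_supported_error":   501,  # ERROR_TYPE_NOT_SUPPORTED (custom)
    "unavailable_error":     503,  # ERROR_TYPE_UNAVAILABLE (custom)
}

def check_error_body(body: dict) -> None:
    """Assert a format_error_response-style payload ({"code", "message", "type"}) is consistent."""
    assert ERROR_TYPES.get(body["type"]) == body["code"], body

check_error_body({"code": 503, "message": "example message", "type": "unavailable_error"})
```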
diff --git a/examples/server/webui/.prettierignore b/examples/server/webui/.prettierignore deleted file mode 100644 index c0cb165b3..000000000 --- a/examples/server/webui/.prettierignore +++ /dev/null @@ -1,10 +0,0 @@ -**/.vscode -**/.github -**/.git -**/.svn -**/.hg -**/node_modules -**/dist -**/build - -*.config.js diff --git a/examples/server/webui/eslint.config.js b/examples/server/webui/eslint.config.js deleted file mode 100644 index 7c0d39b89..000000000 --- a/examples/server/webui/eslint.config.js +++ /dev/null @@ -1,26 +0,0 @@ -import js from '@eslint/js' -import globals from 'globals' -import reactHooks from 'eslint-plugin-react-hooks' -import reactRefresh from 'eslint-plugin-react-refresh' -import tseslint from 'typescript-eslint' - -export default tseslint.config( - { ignores: ['dist'] }, - { - extends: [js.configs.recommended, ...tseslint.configs.recommended], - files: ['**/*.{ts,tsx}'], - languageOptions: { - ecmaVersion: 2020, - globals: globals.browser, - }, - plugins: { - 'react-hooks': reactHooks, - 'react-refresh': reactRefresh, - }, - rules: { - ...reactHooks.configs.recommended.rules, - 'react-refresh/only-export-components': 'off', - '@typescript-eslint/no-unused-vars': 'off', - }, - }, -) diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html deleted file mode 100644 index 471f46b3a..000000000 --- a/examples/server/webui/index.html +++ /dev/null @@ -1,16 +0,0 @@ - - - - - - - 🦙 llama.cpp - chat - - -
- - - diff --git a/examples/server/webui/package-lock.json b/examples/server/webui/package-lock.json deleted file mode 100644 index c6c5de3c0..000000000 --- a/examples/server/webui/package-lock.json +++ /dev/null @@ -1,6608 +0,0 @@ -{ - "name": "webui", - "version": "0.0.0", - "lockfileVersion": 3, - "requires": true, - "packages": { - "": { - "name": "webui", - "version": "0.0.0", - "dependencies": { - "@heroicons/react": "^2.2.0", - "@sec-ant/readable-stream": "^0.6.0", - "@vscode/markdown-it-katex": "^1.1.1", - "autoprefixer": "^10.4.20", - "daisyui": "^4.12.14", - "highlight.js": "^11.10.0", - "katex": "^0.16.15", - "postcss": "^8.4.49", - "react": "^18.3.1", - "react-dom": "^18.3.1", - "react-markdown": "^9.0.3", - "react-router": "^7.1.5", - "rehype-highlight": "^7.0.2", - "rehype-katex": "^7.0.1", - "remark-breaks": "^4.0.0", - "remark-gfm": "^4.0.0", - "remark-math": "^6.0.0", - "tailwindcss": "^3.4.15", - "textlinestream": "^1.1.1", - "vite-plugin-singlefile": "^2.0.3" - }, - "devDependencies": { - "@eslint/js": "^9.17.0", - "@types/markdown-it": "^14.1.2", - "@types/node": "^22.13.1", - "@types/react": "^18.3.18", - "@types/react-dom": "^18.3.5", - "@vitejs/plugin-react": "^4.3.4", - "eslint": "^9.17.0", - "eslint-plugin-react-hooks": "^5.0.0", - "eslint-plugin-react-refresh": "^0.4.16", - "globals": "^15.14.0", - "prettier": "^3.4.2", - "sass-embedded": "^1.83.4", - "typescript": "~5.6.2", - "typescript-eslint": "^8.18.2", - "vite": "^6.0.5" - } - }, - "node_modules/@alloc/quick-lru": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/@alloc/quick-lru/-/quick-lru-5.2.0.tgz", - "integrity": "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==", - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/@ampproject/remapping": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz", - "integrity": "sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@jridgewell/gen-mapping": "^0.3.5", - "@jridgewell/trace-mapping": "^0.3.24" - }, - "engines": { - "node": ">=6.0.0" - } - }, - "node_modules/@babel/code-frame": { - "version": "7.26.2", - "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.26.2.tgz", - "integrity": "sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/helper-validator-identifier": "^7.25.9", - "js-tokens": "^4.0.0", - "picocolors": "^1.0.0" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/compat-data": { - "version": "7.26.5", - "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.26.5.tgz", - "integrity": "sha512-XvcZi1KWf88RVbF9wn8MN6tYFloU5qX8KjuF3E1PVBmJ9eypXfs4GRiJwLuTZL0iSnJUKn1BFPa5BPZZJyFzPg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/core": { - "version": "7.26.7", - "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.26.7.tgz", - "integrity": "sha512-SRijHmF0PSPgLIBYlWnG0hyeJLwXE2CgpsXaMOrtt2yp9/86ALw6oUlj9KYuZ0JN07T4eBMVIW4li/9S1j2BGA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@ampproject/remapping": "^2.2.0", - "@babel/code-frame": "^7.26.2", - "@babel/generator": "^7.26.5", - 
"@babel/helper-compilation-targets": "^7.26.5", - "@babel/helper-module-transforms": "^7.26.0", - "@babel/helpers": "^7.26.7", - "@babel/parser": "^7.26.7", - "@babel/template": "^7.25.9", - "@babel/traverse": "^7.26.7", - "@babel/types": "^7.26.7", - "convert-source-map": "^2.0.0", - "debug": "^4.1.0", - "gensync": "^1.0.0-beta.2", - "json5": "^2.2.3", - "semver": "^6.3.1" - }, - "engines": { - "node": ">=6.9.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/babel" - } - }, - "node_modules/@babel/generator": { - "version": "7.26.5", - "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.26.5.tgz", - "integrity": "sha512-2caSP6fN9I7HOe6nqhtft7V4g7/V/gfDsC3Ag4W7kEzzvRGKqiv0pu0HogPiZ3KaVSoNDhUws6IJjDjpfmYIXw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/parser": "^7.26.5", - "@babel/types": "^7.26.5", - "@jridgewell/gen-mapping": "^0.3.5", - "@jridgewell/trace-mapping": "^0.3.25", - "jsesc": "^3.0.2" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helper-compilation-targets": { - "version": "7.26.5", - "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.26.5.tgz", - "integrity": "sha512-IXuyn5EkouFJscIDuFF5EsiSolseme1s0CZB+QxVugqJLYmKdxI1VfIBOst0SUu4rnk2Z7kqTwmoO1lp3HIfnA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/compat-data": "^7.26.5", - "@babel/helper-validator-option": "^7.25.9", - "browserslist": "^4.24.0", - "lru-cache": "^5.1.1", - "semver": "^6.3.1" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helper-module-imports": { - "version": "7.25.9", - "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.25.9.tgz", - "integrity": "sha512-tnUA4RsrmflIM6W6RFTLFSXITtl0wKjgpnLgXyowocVPrbYrLUXSBXDgTs8BlbmIzIdlBySRQjINYs2BAkiLtw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/traverse": "^7.25.9", - "@babel/types": "^7.25.9" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helper-module-transforms": { - "version": "7.26.0", - "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.26.0.tgz", - "integrity": "sha512-xO+xu6B5K2czEnQye6BHA7DolFFmS3LB7stHZFaOLb1pAwO1HWLS8fXA+eh0A2yIvltPVmx3eNNDBJA2SLHXFw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/helper-module-imports": "^7.25.9", - "@babel/helper-validator-identifier": "^7.25.9", - "@babel/traverse": "^7.25.9" - }, - "engines": { - "node": ">=6.9.0" - }, - "peerDependencies": { - "@babel/core": "^7.0.0" - } - }, - "node_modules/@babel/helper-plugin-utils": { - "version": "7.26.5", - "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.26.5.tgz", - "integrity": "sha512-RS+jZcRdZdRFzMyr+wcsaqOmld1/EqTghfaBGQQd/WnRdzdlvSZ//kF7U8VQTxf1ynZ4cjUcYgjVGx13ewNPMg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helper-string-parser": { - "version": "7.25.9", - "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz", - "integrity": "sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helper-validator-identifier": { - "version": "7.25.9", - "resolved": 
"https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz", - "integrity": "sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helper-validator-option": { - "version": "7.25.9", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.25.9.tgz", - "integrity": "sha512-e/zv1co8pp55dNdEcCynfj9X7nyUKUXoUEwfXqaZt0omVOmDe9oOTdKStH4GmAw6zxMFs50ZayuMfHDKlO7Tfw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helpers": { - "version": "7.26.7", - "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.7.tgz", - "integrity": "sha512-8NHiL98vsi0mbPQmYAGWwfcFaOy4j2HY49fXJCfuDcdE7fMIsH9a7GdaeXpIBsbT7307WU8KCMp5pUVDNL4f9A==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/template": "^7.25.9", - "@babel/types": "^7.26.7" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/parser": { - "version": "7.26.7", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.7.tgz", - "integrity": "sha512-kEvgGGgEjRUutvdVvZhbn/BxVt+5VSpwXz1j3WYXQbXDo8KzFOPNG2GQbdAiNq8g6wn1yKk7C/qrke03a84V+w==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/types": "^7.26.7" - }, - "bin": { - "parser": "bin/babel-parser.js" - }, - "engines": { - "node": ">=6.0.0" - } - }, - "node_modules/@babel/plugin-transform-react-jsx-self": { - "version": "7.25.9", - "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-self/-/plugin-transform-react-jsx-self-7.25.9.tgz", - "integrity": "sha512-y8quW6p0WHkEhmErnfe58r7x0A70uKphQm8Sp8cV7tjNQwK56sNVK0M73LK3WuYmsuyrftut4xAkjjgU0twaMg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/helper-plugin-utils": "^7.25.9" - }, - "engines": { - "node": ">=6.9.0" - }, - "peerDependencies": { - "@babel/core": "^7.0.0-0" - } - }, - "node_modules/@babel/plugin-transform-react-jsx-source": { - "version": "7.25.9", - "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-source/-/plugin-transform-react-jsx-source-7.25.9.tgz", - "integrity": "sha512-+iqjT8xmXhhYv4/uiYd8FNQsraMFZIfxVSqxxVSZP0WbbSAWvBXAul0m/zu+7Vv4O/3WtApy9pmaTMiumEZgfg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/helper-plugin-utils": "^7.25.9" - }, - "engines": { - "node": ">=6.9.0" - }, - "peerDependencies": { - "@babel/core": "^7.0.0-0" - } - }, - "node_modules/@babel/template": { - "version": "7.25.9", - "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.25.9.tgz", - "integrity": "sha512-9DGttpmPvIxBb/2uwpVo3dqJ+O6RooAFOS+lB+xDqoE2PVCE8nfoHMdZLpfCQRLwvohzXISPZcgxt80xLfsuwg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/code-frame": "^7.25.9", - "@babel/parser": "^7.25.9", - "@babel/types": "^7.25.9" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/traverse": { - "version": "7.26.7", - "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.26.7.tgz", - "integrity": "sha512-1x1sgeyRLC3r5fQOM0/xtQKsYjyxmFjaOrLJNtZ81inNjyJHGIolTULPiSc/2qe1/qfpFLisLQYFnnZl7QoedA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/code-frame": "^7.26.2", - "@babel/generator": "^7.26.5", - "@babel/parser": "^7.26.7", - "@babel/template": "^7.25.9", - "@babel/types": "^7.26.7", - "debug": "^4.3.1", - "globals": "^11.1.0" - }, - 
"engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/traverse/node_modules/globals": { - "version": "11.12.0", - "resolved": "https://registry.npmjs.org/globals/-/globals-11.12.0.tgz", - "integrity": "sha512-WOBp/EEGUiIsJSp7wcv/y6MO+lV9UoncWqxuFfm8eBwzWNgyfBd6Gz+IeKQ9jCmyhoH99g15M3T+QaVHFjizVA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=4" - } - }, - "node_modules/@babel/types": { - "version": "7.26.7", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.7.tgz", - "integrity": "sha512-t8kDRGrKXyp6+tjUh7hw2RLyclsW4TRoRvRHtSyAX9Bb5ldlFh+90YAYY6awRXrlB4G5G2izNeGySpATlFzmOg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/helper-string-parser": "^7.25.9", - "@babel/helper-validator-identifier": "^7.25.9" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@bufbuild/protobuf": { - "version": "2.2.3", - "resolved": "https://registry.npmjs.org/@bufbuild/protobuf/-/protobuf-2.2.3.tgz", - "integrity": "sha512-tFQoXHJdkEOSwj5tRIZSPNUuXK3RaR7T1nUrPgbYX1pUbvqqaaZAsfo+NXBPsz5rZMSKVFrgK1WL8Q/MSLvprg==", - "devOptional": true, - "license": "(Apache-2.0 AND BSD-3-Clause)" - }, - "node_modules/@esbuild/aix-ppc64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.24.2.tgz", - "integrity": "sha512-thpVCb/rhxE/BnMLQ7GReQLLN8q9qbHmI55F4489/ByVg2aQaQ6kbcLb6FHkocZzQhxc4gx0sCk0tJkKBFzDhA==", - "cpu": [ - "ppc64" - ], - "license": "MIT", - "optional": true, - "os": [ - "aix" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-arm": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.24.2.tgz", - "integrity": "sha512-tmwl4hJkCfNHwFB3nBa8z1Uy3ypZpxqxfTQOcHX+xRByyYgunVbZ9MzUUfb0RxaHIMnbHagwAxuTL+tnNM+1/Q==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-arm64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.24.2.tgz", - "integrity": "sha512-cNLgeqCqV8WxfcTIOeL4OAtSmL8JjcN6m09XIgro1Wi7cF4t/THaWEa7eL5CMoMBdjoHOTh/vwTO/o2TRXIyzg==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-x64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.24.2.tgz", - "integrity": "sha512-B6Q0YQDqMx9D7rvIcsXfmJfvUYLoP722bgfBlO5cGvNVb5V/+Y7nhBE3mHV9OpxBf4eAS2S68KZztiPaWq4XYw==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/darwin-arm64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.24.2.tgz", - "integrity": "sha512-kj3AnYWc+CekmZnS5IPu9D+HWtUI49hbnyqk0FLEJDbzCIQt7hg7ucF1SQAilhtYpIujfaHr6O0UHlzzSPdOeA==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/darwin-x64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.24.2.tgz", - "integrity": "sha512-WeSrmwwHaPkNR5H3yYfowhZcbriGqooyu3zI/3GGpF8AyUdsrrP0X6KumITGA9WOyiJavnGZUwPGvxvwfWPHIA==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=18" - 
} - }, - "node_modules/@esbuild/freebsd-arm64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.24.2.tgz", - "integrity": "sha512-UN8HXjtJ0k/Mj6a9+5u6+2eZ2ERD7Edt1Q9IZiB5UZAIdPnVKDoG7mdTVGhHJIeEml60JteamR3qhsr1r8gXvg==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/freebsd-x64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.24.2.tgz", - "integrity": "sha512-TvW7wE/89PYW+IevEJXZ5sF6gJRDY/14hyIGFXdIucxCsbRmLUcjseQu1SyTko+2idmCw94TgyaEZi9HUSOe3Q==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-arm": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.24.2.tgz", - "integrity": "sha512-n0WRM/gWIdU29J57hJyUdIsk0WarGd6To0s+Y+LwvlC55wt+GT/OgkwoXCXvIue1i1sSNWblHEig00GBWiJgfA==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-arm64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.24.2.tgz", - "integrity": "sha512-7HnAD6074BW43YvvUmE/35Id9/NB7BeX5EoNkK9obndmZBUk8xmJJeU7DwmUeN7tkysslb2eSl6CTrYz6oEMQg==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-ia32": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.24.2.tgz", - "integrity": "sha512-sfv0tGPQhcZOgTKO3oBE9xpHuUqguHvSo4jl+wjnKwFpapx+vUDcawbwPNuBIAYdRAvIDBfZVvXprIj3HA+Ugw==", - "cpu": [ - "ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-loong64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.24.2.tgz", - "integrity": "sha512-CN9AZr8kEndGooS35ntToZLTQLHEjtVB5n7dl8ZcTZMonJ7CCfStrYhrzF97eAecqVbVJ7APOEe18RPI4KLhwQ==", - "cpu": [ - "loong64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-mips64el": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.24.2.tgz", - "integrity": "sha512-iMkk7qr/wl3exJATwkISxI7kTcmHKE+BlymIAbHO8xanq/TjHaaVThFF6ipWzPHryoFsesNQJPE/3wFJw4+huw==", - "cpu": [ - "mips64el" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-ppc64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.24.2.tgz", - "integrity": "sha512-shsVrgCZ57Vr2L8mm39kO5PPIb+843FStGt7sGGoqiiWYconSxwTiuswC1VJZLCjNiMLAMh34jg4VSEQb+iEbw==", - "cpu": [ - "ppc64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-riscv64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.24.2.tgz", - "integrity": "sha512-4eSFWnU9Hhd68fW16GD0TINewo1L6dRrB+oLNNbYyMUAeOD2yCK5KXGK1GH4qD/kT+bTEXjsyTCiJGHPZ3eM9Q==", - "cpu": [ - "riscv64" - ], - "license": "MIT", - 
"optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-s390x": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.24.2.tgz", - "integrity": "sha512-S0Bh0A53b0YHL2XEXC20bHLuGMOhFDO6GN4b3YjRLK//Ep3ql3erpNcPlEFed93hsQAjAQDNsvcK+hV90FubSw==", - "cpu": [ - "s390x" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-x64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.24.2.tgz", - "integrity": "sha512-8Qi4nQcCTbLnK9WoMjdC9NiTG6/E38RNICU6sUNqK0QFxCYgoARqVqxdFmWkdonVsvGqWhmm7MO0jyTqLqwj0Q==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/netbsd-arm64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.24.2.tgz", - "integrity": "sha512-wuLK/VztRRpMt9zyHSazyCVdCXlpHkKm34WUyinD2lzK07FAHTq0KQvZZlXikNWkDGoT6x3TD51jKQ7gMVpopw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "netbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/netbsd-x64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.24.2.tgz", - "integrity": "sha512-VefFaQUc4FMmJuAxmIHgUmfNiLXY438XrL4GDNV1Y1H/RW3qow68xTwjZKfj/+Plp9NANmzbH5R40Meudu8mmw==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "netbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openbsd-arm64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.24.2.tgz", - "integrity": "sha512-YQbi46SBct6iKnszhSvdluqDmxCJA+Pu280Av9WICNwQmMxV7nLRHZfjQzwbPs3jeWnuAhE9Jy0NrnJ12Oz+0A==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openbsd-x64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.24.2.tgz", - "integrity": "sha512-+iDS6zpNM6EnJyWv0bMGLWSWeXGN/HTaF/LXHXHwejGsVi+ooqDfMCCTerNFxEkM3wYVcExkeGXNqshc9iMaOA==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/sunos-x64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.24.2.tgz", - "integrity": "sha512-hTdsW27jcktEvpwNHJU4ZwWFGkz2zRJUz8pvddmXPtXDzVKTTINmlmga3ZzwcuMpUvLw7JkLy9QLKyGpD2Yxig==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "sunos" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-arm64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.24.2.tgz", - "integrity": "sha512-LihEQ2BBKVFLOC9ZItT9iFprsE9tqjDjnbulhHoFxYQtQfai7qfluVODIYxt1PgdoyQkz23+01rzwNwYfutxUQ==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-ia32": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.24.2.tgz", - "integrity": "sha512-q+iGUwfs8tncmFC9pcnD5IvRHAzmbwQ3GPS5/ceCyHdjXubwQWI12MKWSNSMYLJMq23/IUCvJMS76PDqXe1fxA==", - "cpu": [ - 
"ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-x64": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.24.2.tgz", - "integrity": "sha512-7VTgWzgMGvup6aSqDPLiW5zHaxYJGTO4OokMjIlrCtf+VpEL+cXKtCvg723iguPYI5oaUNdS+/V7OU2gvXVWEg==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@eslint-community/eslint-utils": { - "version": "4.4.1", - "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.4.1.tgz", - "integrity": "sha512-s3O3waFUrMV8P/XaF/+ZTp1X9XBZW1a4B97ZnjQF2KYWaFD2A8KyFBsrsfSjEmjn3RGWAIuvlneuZm3CUK3jbA==", - "dev": true, - "license": "MIT", - "dependencies": { - "eslint-visitor-keys": "^3.4.3" - }, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - }, - "peerDependencies": { - "eslint": "^6.0.0 || ^7.0.0 || >=8.0.0" - } - }, - "node_modules/@eslint-community/eslint-utils/node_modules/eslint-visitor-keys": { - "version": "3.4.3", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz", - "integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/@eslint-community/regexpp": { - "version": "4.12.1", - "resolved": "https://registry.npmjs.org/@eslint-community/regexpp/-/regexpp-4.12.1.tgz", - "integrity": "sha512-CCZCDJuduB9OUkFkY2IgppNZMi2lBQgD2qzwXkEia16cge2pijY/aXi96CJMquDMn3nJdlPV1A5KrJEXwfLNzQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": "^12.0.0 || ^14.0.0 || >=16.0.0" - } - }, - "node_modules/@eslint/config-array": { - "version": "0.19.2", - "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.19.2.tgz", - "integrity": "sha512-GNKqxfHG2ySmJOBSHg7LxeUx4xpuCoFjacmlCoYWEbaPXLwvfIjixRI12xCQZeULksQb23uiA8F40w5TojpV7w==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@eslint/object-schema": "^2.1.6", - "debug": "^4.3.1", - "minimatch": "^3.1.2" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - } - }, - "node_modules/@eslint/core": { - "version": "0.10.0", - "resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.10.0.tgz", - "integrity": "sha512-gFHJ+xBOo4G3WRlR1e/3G8A6/KZAH6zcE/hkLRCZTi/B9avAG365QhFA8uOGzTMqgTghpn7/fSnscW++dpMSAw==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@types/json-schema": "^7.0.15" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - } - }, - "node_modules/@eslint/eslintrc": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-3.2.0.tgz", - "integrity": "sha512-grOjVNN8P3hjJn/eIETF1wwd12DdnwFDoyceUJLYYdkpbwq3nLi+4fqrTAONx7XDALqlL220wC/RHSC/QTI/0w==", - "dev": true, - "license": "MIT", - "dependencies": { - "ajv": "^6.12.4", - "debug": "^4.3.2", - "espree": "^10.0.1", - "globals": "^14.0.0", - "ignore": "^5.2.0", - "import-fresh": "^3.2.1", - "js-yaml": "^4.1.0", - "minimatch": "^3.1.2", - "strip-json-comments": "^3.1.1" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - 
"node_modules/@eslint/eslintrc/node_modules/globals": { - "version": "14.0.0", - "resolved": "https://registry.npmjs.org/globals/-/globals-14.0.0.tgz", - "integrity": "sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/@eslint/js": { - "version": "9.19.0", - "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.19.0.tgz", - "integrity": "sha512-rbq9/g38qjfqFLOVPvwjIvFFdNziEC5S65jmjPw5r6A//QH+W91akh9irMwjDN8zKUTak6W9EsAv4m/7Wnw0UQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - } - }, - "node_modules/@eslint/object-schema": { - "version": "2.1.6", - "resolved": "https://registry.npmjs.org/@eslint/object-schema/-/object-schema-2.1.6.tgz", - "integrity": "sha512-RBMg5FRL0I0gs51M/guSAj5/e14VQ4tpZnQNWwuDT66P14I43ItmPfIZRhO9fUVIPOAQXU47atlywZ/czoqFPA==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - } - }, - "node_modules/@eslint/plugin-kit": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.2.5.tgz", - "integrity": "sha512-lB05FkqEdUg2AA0xEbUz0SnkXT1LcCTa438W4IWTUh4hdOnVbQyOJ81OrDXsJk/LSiJHubgGEFoR5EHq1NsH1A==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@eslint/core": "^0.10.0", - "levn": "^0.4.1" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - } - }, - "node_modules/@heroicons/react": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@heroicons/react/-/react-2.2.0.tgz", - "integrity": "sha512-LMcepvRaS9LYHJGsF0zzmgKCUim/X3N/DQKc4jepAXJ7l8QxJ1PmxJzqplF2Z3FE4PqBAIGyJAQ/w4B5dsqbtQ==", - "license": "MIT", - "peerDependencies": { - "react": ">= 16 || ^19.0.0-rc" - } - }, - "node_modules/@humanfs/core": { - "version": "0.19.1", - "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz", - "integrity": "sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=18.18.0" - } - }, - "node_modules/@humanfs/node": { - "version": "0.16.6", - "resolved": "https://registry.npmjs.org/@humanfs/node/-/node-0.16.6.tgz", - "integrity": "sha512-YuI2ZHQL78Q5HbhDiBA1X4LmYdXCKCMQIfw0pw7piHJwyREFebJUvrQN4cMssyES6x+vfUbx1CIpaQUKYdQZOw==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@humanfs/core": "^0.19.1", - "@humanwhocodes/retry": "^0.3.0" - }, - "engines": { - "node": ">=18.18.0" - } - }, - "node_modules/@humanfs/node/node_modules/@humanwhocodes/retry": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/@humanwhocodes/retry/-/retry-0.3.1.tgz", - "integrity": "sha512-JBxkERygn7Bv/GbN5Rv8Ul6LVknS+5Bp6RgDC/O8gEBU/yeH5Ui5C/OlWrTb6qct7LjjfT6Re2NxB0ln0yYybA==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=18.18" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/nzakas" - } - }, - "node_modules/@humanwhocodes/module-importer": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/@humanwhocodes/module-importer/-/module-importer-1.0.1.tgz", - "integrity": "sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=12.22" - }, - "funding": { - "type": "github", - "url": 
"https://github.com/sponsors/nzakas" - } - }, - "node_modules/@humanwhocodes/retry": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/@humanwhocodes/retry/-/retry-0.4.1.tgz", - "integrity": "sha512-c7hNEllBlenFTHBky65mhq8WD2kbN9Q6gk0bTk8lSBvc554jpXSkST1iePudpt7+A/AQvuHs9EMqjHDXMY1lrA==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=18.18" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/nzakas" - } - }, - "node_modules/@isaacs/cliui": { - "version": "8.0.2", - "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", - "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==", - "license": "ISC", - "dependencies": { - "string-width": "^5.1.2", - "string-width-cjs": "npm:string-width@^4.2.0", - "strip-ansi": "^7.0.1", - "strip-ansi-cjs": "npm:strip-ansi@^6.0.1", - "wrap-ansi": "^8.1.0", - "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0" - }, - "engines": { - "node": ">=12" - } - }, - "node_modules/@jridgewell/gen-mapping": { - "version": "0.3.8", - "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.8.tgz", - "integrity": "sha512-imAbBGkb+ebQyxKgzv5Hu2nmROxoDOXHh80evxdoXNOrvAnVx7zimzc1Oo5h9RlfV4vPXaE2iM5pOFbvOCClWA==", - "license": "MIT", - "dependencies": { - "@jridgewell/set-array": "^1.2.1", - "@jridgewell/sourcemap-codec": "^1.4.10", - "@jridgewell/trace-mapping": "^0.3.24" - }, - "engines": { - "node": ">=6.0.0" - } - }, - "node_modules/@jridgewell/resolve-uri": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", - "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", - "license": "MIT", - "engines": { - "node": ">=6.0.0" - } - }, - "node_modules/@jridgewell/set-array": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz", - "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==", - "license": "MIT", - "engines": { - "node": ">=6.0.0" - } - }, - "node_modules/@jridgewell/sourcemap-codec": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz", - "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==", - "license": "MIT" - }, - "node_modules/@jridgewell/trace-mapping": { - "version": "0.3.25", - "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz", - "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==", - "license": "MIT", - "dependencies": { - "@jridgewell/resolve-uri": "^3.1.0", - "@jridgewell/sourcemap-codec": "^1.4.14" - } - }, - "node_modules/@nodelib/fs.scandir": { - "version": "2.1.5", - "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", - "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", - "license": "MIT", - "dependencies": { - "@nodelib/fs.stat": "2.0.5", - "run-parallel": "^1.1.9" - }, - "engines": { - "node": ">= 8" - } - }, - "node_modules/@nodelib/fs.stat": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", - "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", 
- "license": "MIT", - "engines": { - "node": ">= 8" - } - }, - "node_modules/@nodelib/fs.walk": { - "version": "1.2.8", - "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", - "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", - "license": "MIT", - "dependencies": { - "@nodelib/fs.scandir": "2.1.5", - "fastq": "^1.6.0" - }, - "engines": { - "node": ">= 8" - } - }, - "node_modules/@pkgjs/parseargs": { - "version": "0.11.0", - "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", - "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==", - "license": "MIT", - "optional": true, - "engines": { - "node": ">=14" - } - }, - "node_modules/@rollup/rollup-android-arm-eabi": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.34.2.tgz", - "integrity": "sha512-6Fyg9yQbwJR+ykVdT9sid1oc2ewejS6h4wzQltmJfSW53N60G/ah9pngXGANdy9/aaE/TcUFpWosdm7JXS1WTQ==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ] - }, - "node_modules/@rollup/rollup-android-arm64": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.34.2.tgz", - "integrity": "sha512-K5GfWe+vtQ3kyEbihrimM38UgX57UqHp+oME7X/EX9Im6suwZfa7Hsr8AtzbJvukTpwMGs+4s29YMSO3rwWtsw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ] - }, - "node_modules/@rollup/rollup-darwin-arm64": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.34.2.tgz", - "integrity": "sha512-PSN58XG/V/tzqDb9kDGutUruycgylMlUE59f40ny6QIRNsTEIZsrNQTJKUN2keMMSmlzgunMFqyaGLmly39sug==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ] - }, - "node_modules/@rollup/rollup-darwin-x64": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.34.2.tgz", - "integrity": "sha512-gQhK788rQJm9pzmXyfBB84VHViDERhAhzGafw+E5mUpnGKuxZGkMVDa3wgDFKT6ukLC5V7QTifzsUKdNVxp5qQ==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ] - }, - "node_modules/@rollup/rollup-freebsd-arm64": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.34.2.tgz", - "integrity": "sha512-eiaHgQwGPpxLC3+zTAcdKl4VsBl3r0AiJOd1Um/ArEzAjN/dbPK1nROHrVkdnoE6p7Svvn04w3f/jEZSTVHunA==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ] - }, - "node_modules/@rollup/rollup-freebsd-x64": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.34.2.tgz", - "integrity": "sha512-lhdiwQ+jf8pewYOTG4bag0Qd68Jn1v2gO1i0mTuiD+Qkt5vNfHVK/jrT7uVvycV8ZchlzXp5HDVmhpzjC6mh0g==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ] - }, - "node_modules/@rollup/rollup-linux-arm-gnueabihf": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.34.2.tgz", - "integrity": "sha512-lfqTpWjSvbgQP1vqGTXdv+/kxIznKXZlI109WkIFPbud41bjigjNmOAAKoazmRGx+k9e3rtIdbq2pQZPV1pMig==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - 
"node_modules/@rollup/rollup-linux-arm-musleabihf": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.34.2.tgz", - "integrity": "sha512-RGjqULqIurqqv+NJTyuPgdZhka8ImMLB32YwUle2BPTDqDoXNgwFjdjQC59FbSk08z0IqlRJjrJ0AvDQ5W5lpw==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-arm64-gnu": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.34.2.tgz", - "integrity": "sha512-ZvkPiheyXtXlFqHpsdgscx+tZ7hoR59vOettvArinEspq5fxSDSgfF+L5wqqJ9R4t+n53nyn0sKxeXlik7AY9Q==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-arm64-musl": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.34.2.tgz", - "integrity": "sha512-UlFk+E46TZEoxD9ufLKDBzfSG7Ki03fo6hsNRRRHF+KuvNZ5vd1RRVQm8YZlGsjcJG8R252XFK0xNPay+4WV7w==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-loongarch64-gnu": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.34.2.tgz", - "integrity": "sha512-hJhfsD9ykx59jZuuoQgYT1GEcNNi3RCoEmbo5OGfG8RlHOiVS7iVNev9rhLKh7UBYq409f4uEw0cclTXx8nh8Q==", - "cpu": [ - "loong64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.34.2.tgz", - "integrity": "sha512-g/O5IpgtrQqPegvqopvmdCF9vneLE7eqYfdPWW8yjPS8f63DNam3U4ARL1PNNB64XHZDHKpvO2Giftf43puB8Q==", - "cpu": [ - "ppc64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-riscv64-gnu": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.34.2.tgz", - "integrity": "sha512-bSQijDC96M6PuooOuXHpvXUYiIwsnDmqGU8+br2U7iPoykNi9JtMUpN7K6xml29e0evK0/g0D1qbAUzWZFHY5Q==", - "cpu": [ - "riscv64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-s390x-gnu": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.34.2.tgz", - "integrity": "sha512-49TtdeVAsdRuiUHXPrFVucaP4SivazetGUVH8CIxVsNsaPHV4PFkpLmH9LeqU/R4Nbgky9lzX5Xe1NrzLyraVA==", - "cpu": [ - "s390x" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-x64-gnu": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.34.2.tgz", - "integrity": "sha512-j+jFdfOycLIQ7FWKka9Zd3qvsIyugg5LeZuHF6kFlXo6MSOc6R1w37YUVy8VpAKd81LMWGi5g9J25P09M0SSIw==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-x64-musl": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.34.2.tgz", - "integrity": "sha512-aDPHyM/D2SpXfSNCVWCxyHmOqN9qb7SWkY1+vaXqMNMXslZYnwh9V/UCudl6psyG0v6Ukj7pXanIpfZwCOEMUg==", - "cpu": [ - "x64" - ], - "license": "MIT", - 
"optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-win32-arm64-msvc": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.34.2.tgz", - "integrity": "sha512-LQRkCyUBnAo7r8dbEdtNU08EKLCJMgAk2oP5H3R7BnUlKLqgR3dUjrLBVirmc1RK6U6qhtDw29Dimeer8d5hzQ==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@rollup/rollup-win32-ia32-msvc": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.34.2.tgz", - "integrity": "sha512-wt8OhpQUi6JuPFkm1wbVi1BByeag87LDFzeKSXzIdGcX4bMLqORTtKxLoCbV57BHYNSUSOKlSL4BYYUghainYA==", - "cpu": [ - "ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@rollup/rollup-win32-x64-msvc": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.34.2.tgz", - "integrity": "sha512-rUrqINax0TvrPBXrFKg0YbQx18NpPN3NNrgmaao9xRNbTwek7lOXObhx8tQy8gelmQ/gLaGy1WptpU2eKJZImg==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@sec-ant/readable-stream": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/@sec-ant/readable-stream/-/readable-stream-0.6.0.tgz", - "integrity": "sha512-uiBh8DrB5FN35gP6/o8JEhEQ7/ci1jUsOZO/VMUjyvTpjtV54VstOXVj1TvTj/wsT23pfX6butxxh3qufsW3+g==", - "license": "MIT" - }, - "node_modules/@types/babel__core": { - "version": "7.20.5", - "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz", - "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/parser": "^7.20.7", - "@babel/types": "^7.20.7", - "@types/babel__generator": "*", - "@types/babel__template": "*", - "@types/babel__traverse": "*" - } - }, - "node_modules/@types/babel__generator": { - "version": "7.6.8", - "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.6.8.tgz", - "integrity": "sha512-ASsj+tpEDsEiFr1arWrlN6V3mdfjRMZt6LtK/Vp/kreFLnr5QH5+DhvD5nINYZXzwJvXeGq+05iUXcAzVrqWtw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/types": "^7.0.0" - } - }, - "node_modules/@types/babel__template": { - "version": "7.4.4", - "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz", - "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/parser": "^7.1.0", - "@babel/types": "^7.0.0" - } - }, - "node_modules/@types/babel__traverse": { - "version": "7.20.6", - "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.20.6.tgz", - "integrity": "sha512-r1bzfrm0tomOI8g1SzvCaQHo6Lcv6zu0EA+W2kHrt8dyrHQxGzBBL4kdkzIS+jBMV+EYcMAEAqXqYaLJq5rOZg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/types": "^7.20.7" - } - }, - "node_modules/@types/cookie": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/@types/cookie/-/cookie-0.6.0.tgz", - "integrity": "sha512-4Kh9a6B2bQciAhf7FSuMRRkUWecJgJu9nPnx3yzpsfXX/c50REIqpHY4C82bXP90qrLtXtkDxTZosYO3UpOwlA==", - "license": "MIT" - }, - "node_modules/@types/debug": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz", - 
"integrity": "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==", - "license": "MIT", - "dependencies": { - "@types/ms": "*" - } - }, - "node_modules/@types/estree": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.6.tgz", - "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==", - "license": "MIT" - }, - "node_modules/@types/estree-jsx": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/@types/estree-jsx/-/estree-jsx-1.0.5.tgz", - "integrity": "sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg==", - "license": "MIT", - "dependencies": { - "@types/estree": "*" - } - }, - "node_modules/@types/hast": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/@types/hast/-/hast-3.0.4.tgz", - "integrity": "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ==", - "license": "MIT", - "dependencies": { - "@types/unist": "*" - } - }, - "node_modules/@types/json-schema": { - "version": "7.0.15", - "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", - "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/katex": { - "version": "0.16.7", - "resolved": "https://registry.npmjs.org/@types/katex/-/katex-0.16.7.tgz", - "integrity": "sha512-HMwFiRujE5PjrgwHQ25+bsLJgowjGjm5Z8FVSf0N6PwgJrwxH0QxzHYDcKsTfV3wva0vzrpqMTJS2jXPr5BMEQ==", - "license": "MIT" - }, - "node_modules/@types/linkify-it": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/@types/linkify-it/-/linkify-it-5.0.0.tgz", - "integrity": "sha512-sVDA58zAw4eWAffKOaQH5/5j3XeayukzDk+ewSsnv3p4yJEZHCCzMDiZM8e0OUrRvmpGZ85jf4yDHkHsgBNr9Q==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/markdown-it": { - "version": "14.1.2", - "resolved": "https://registry.npmjs.org/@types/markdown-it/-/markdown-it-14.1.2.tgz", - "integrity": "sha512-promo4eFwuiW+TfGxhi+0x3czqTYJkG8qB17ZUJiVF10Xm7NLVRSLUsfRTU/6h1e24VvRnXCx+hG7li58lkzog==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/linkify-it": "^5", - "@types/mdurl": "^2" - } - }, - "node_modules/@types/mdast": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/@types/mdast/-/mdast-4.0.4.tgz", - "integrity": "sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA==", - "license": "MIT", - "dependencies": { - "@types/unist": "*" - } - }, - "node_modules/@types/mdurl": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/@types/mdurl/-/mdurl-2.0.0.tgz", - "integrity": "sha512-RGdgjQUZba5p6QEFAVx2OGb8rQDL/cPRG7GiedRzMcJ1tYnUANBncjbSB1NRGwbvjcPeikRABz2nshyPk1bhWg==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/ms": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz", - "integrity": "sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==", - "license": "MIT" - }, - "node_modules/@types/node": { - "version": "22.13.1", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.13.1.tgz", - "integrity": "sha512-jK8uzQlrvXqEU91UxiK5J7pKHyzgnI1Qnl0QDHIgVGuolJhRb9EEl28Cj9b3rGR8B2lhFCtvIm5os8lFnO/1Ew==", - "devOptional": true, - "license": "MIT", - "dependencies": { - "undici-types": "~6.20.0" - } - }, - 
"node_modules/@types/prop-types": { - "version": "15.7.14", - "resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.14.tgz", - "integrity": "sha512-gNMvNH49DJ7OJYv+KAKn0Xp45p8PLl6zo2YnvDIbTd4J6MER2BmWN49TG7n9LvkyihINxeKW8+3bfS2yDC9dzQ==", - "license": "MIT" - }, - "node_modules/@types/react": { - "version": "18.3.18", - "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.18.tgz", - "integrity": "sha512-t4yC+vtgnkYjNSKlFx1jkAhH8LgTo2N/7Qvi83kdEaUtMDiwpbLAktKDaAMlRcJ5eSxZkH74eEGt1ky31d7kfQ==", - "license": "MIT", - "dependencies": { - "@types/prop-types": "*", - "csstype": "^3.0.2" - } - }, - "node_modules/@types/react-dom": { - "version": "18.3.5", - "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-18.3.5.tgz", - "integrity": "sha512-P4t6saawp+b/dFrUr2cvkVsfvPguwsxtH6dNIYRllMsefqFzkZk5UIjzyDOv5g1dXIPdG4Sp1yCR4Z6RCUsG/Q==", - "dev": true, - "license": "MIT", - "peerDependencies": { - "@types/react": "^18.0.0" - } - }, - "node_modules/@types/unist": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", - "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", - "license": "MIT" - }, - "node_modules/@typescript-eslint/eslint-plugin": { - "version": "8.23.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.23.0.tgz", - "integrity": "sha512-vBz65tJgRrA1Q5gWlRfvoH+w943dq9K1p1yDBY2pc+a1nbBLZp7fB9+Hk8DaALUbzjqlMfgaqlVPT1REJdkt/w==", - "dev": true, - "license": "MIT", - "dependencies": { - "@eslint-community/regexpp": "^4.10.0", - "@typescript-eslint/scope-manager": "8.23.0", - "@typescript-eslint/type-utils": "8.23.0", - "@typescript-eslint/utils": "8.23.0", - "@typescript-eslint/visitor-keys": "8.23.0", - "graphemer": "^1.4.0", - "ignore": "^5.3.1", - "natural-compare": "^1.4.0", - "ts-api-utils": "^2.0.1" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - }, - "peerDependencies": { - "@typescript-eslint/parser": "^8.0.0 || ^8.0.0-alpha.0", - "eslint": "^8.57.0 || ^9.0.0", - "typescript": ">=4.8.4 <5.8.0" - } - }, - "node_modules/@typescript-eslint/parser": { - "version": "8.23.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-8.23.0.tgz", - "integrity": "sha512-h2lUByouOXFAlMec2mILeELUbME5SZRN/7R9Cw2RD2lRQQY08MWMM+PmVVKKJNK1aIwqTo9t/0CvOxwPbRIE2Q==", - "dev": true, - "license": "MIT", - "dependencies": { - "@typescript-eslint/scope-manager": "8.23.0", - "@typescript-eslint/types": "8.23.0", - "@typescript-eslint/typescript-estree": "8.23.0", - "@typescript-eslint/visitor-keys": "8.23.0", - "debug": "^4.3.4" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - }, - "peerDependencies": { - "eslint": "^8.57.0 || ^9.0.0", - "typescript": ">=4.8.4 <5.8.0" - } - }, - "node_modules/@typescript-eslint/scope-manager": { - "version": "8.23.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-8.23.0.tgz", - "integrity": "sha512-OGqo7+dXHqI7Hfm+WqkZjKjsiRtFUQHPdGMXzk5mYXhJUedO7e/Y7i8AK3MyLMgZR93TX4bIzYrfyVjLC+0VSw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@typescript-eslint/types": "8.23.0", - "@typescript-eslint/visitor-keys": "8.23.0" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 
|| >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - } - }, - "node_modules/@typescript-eslint/type-utils": { - "version": "8.23.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-8.23.0.tgz", - "integrity": "sha512-iIuLdYpQWZKbiH+RkCGc6iu+VwscP5rCtQ1lyQ7TYuKLrcZoeJVpcLiG8DliXVkUxirW/PWlmS+d6yD51L9jvA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@typescript-eslint/typescript-estree": "8.23.0", - "@typescript-eslint/utils": "8.23.0", - "debug": "^4.3.4", - "ts-api-utils": "^2.0.1" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - }, - "peerDependencies": { - "eslint": "^8.57.0 || ^9.0.0", - "typescript": ">=4.8.4 <5.8.0" - } - }, - "node_modules/@typescript-eslint/types": { - "version": "8.23.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-8.23.0.tgz", - "integrity": "sha512-1sK4ILJbCmZOTt9k4vkoulT6/y5CHJ1qUYxqpF1K/DBAd8+ZUL4LlSCxOssuH5m4rUaaN0uS0HlVPvd45zjduQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - } - }, - "node_modules/@typescript-eslint/typescript-estree": { - "version": "8.23.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-8.23.0.tgz", - "integrity": "sha512-LcqzfipsB8RTvH8FX24W4UUFk1bl+0yTOf9ZA08XngFwMg4Kj8A+9hwz8Cr/ZS4KwHrmo9PJiLZkOt49vPnuvQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@typescript-eslint/types": "8.23.0", - "@typescript-eslint/visitor-keys": "8.23.0", - "debug": "^4.3.4", - "fast-glob": "^3.3.2", - "is-glob": "^4.0.3", - "minimatch": "^9.0.4", - "semver": "^7.6.0", - "ts-api-utils": "^2.0.1" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - }, - "peerDependencies": { - "typescript": ">=4.8.4 <5.8.0" - } - }, - "node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", - "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", - "dev": true, - "license": "MIT", - "dependencies": { - "balanced-match": "^1.0.0" - } - }, - "node_modules/@typescript-eslint/typescript-estree/node_modules/minimatch": { - "version": "9.0.5", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", - "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", - "dev": true, - "license": "ISC", - "dependencies": { - "brace-expansion": "^2.0.1" - }, - "engines": { - "node": ">=16 || 14 >=14.17" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/@typescript-eslint/typescript-estree/node_modules/semver": { - "version": "7.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.1.tgz", - "integrity": "sha512-hlq8tAfn0m/61p4BVRcPzIGr6LKiMwo4VM6dGi6pt4qcRkmNzTcWq6eCEjEh+qXjkMDvPlOFFSGwQjoEa6gyMA==", - "dev": true, - "license": "ISC", - "bin": { - "semver": "bin/semver.js" - }, - "engines": { - "node": ">=10" - } - }, - "node_modules/@typescript-eslint/utils": { - "version": 
"8.23.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-8.23.0.tgz", - "integrity": "sha512-uB/+PSo6Exu02b5ZEiVtmY6RVYO7YU5xqgzTIVZwTHvvK3HsL8tZZHFaTLFtRG3CsV4A5mhOv+NZx5BlhXPyIA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@eslint-community/eslint-utils": "^4.4.0", - "@typescript-eslint/scope-manager": "8.23.0", - "@typescript-eslint/types": "8.23.0", - "@typescript-eslint/typescript-estree": "8.23.0" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - }, - "peerDependencies": { - "eslint": "^8.57.0 || ^9.0.0", - "typescript": ">=4.8.4 <5.8.0" - } - }, - "node_modules/@typescript-eslint/visitor-keys": { - "version": "8.23.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-8.23.0.tgz", - "integrity": "sha512-oWWhcWDLwDfu++BGTZcmXWqpwtkwb5o7fxUIGksMQQDSdPW9prsSnfIOZMlsj4vBOSrcnjIUZMiIjODgGosFhQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@typescript-eslint/types": "8.23.0", - "eslint-visitor-keys": "^4.2.0" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - } - }, - "node_modules/@ungap/structured-clone": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.3.0.tgz", - "integrity": "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g==", - "license": "ISC" - }, - "node_modules/@vitejs/plugin-react": { - "version": "4.3.4", - "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.3.4.tgz", - "integrity": "sha512-SCCPBJtYLdE8PX/7ZQAs1QAZ8Jqwih+0VBLum1EGqmCCQal+MIUqLCzj3ZUy8ufbC0cAM4LRlSTm7IQJwWT4ug==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/core": "^7.26.0", - "@babel/plugin-transform-react-jsx-self": "^7.25.9", - "@babel/plugin-transform-react-jsx-source": "^7.25.9", - "@types/babel__core": "^7.20.5", - "react-refresh": "^0.14.2" - }, - "engines": { - "node": "^14.18.0 || >=16.0.0" - }, - "peerDependencies": { - "vite": "^4.2.0 || ^5.0.0 || ^6.0.0" - } - }, - "node_modules/@vscode/markdown-it-katex": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/@vscode/markdown-it-katex/-/markdown-it-katex-1.1.1.tgz", - "integrity": "sha512-3KTlbsRBPJQLE2YmLL7K6nunTlU+W9T5+FjfNdWuIUKgxSS6HWLQHaO3L4MkJi7z7MpIPpY+g4N+cWNBPE/MSA==", - "license": "MIT", - "dependencies": { - "katex": "^0.16.4" - } - }, - "node_modules/acorn": { - "version": "8.14.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.14.0.tgz", - "integrity": "sha512-cl669nCJTZBsL97OF4kUQm5g5hC2uihk0NxY3WENAC0TYdILVkAyHymAntgxGkl7K+t0cXIrH5siy5S4XkFycA==", - "dev": true, - "license": "MIT", - "bin": { - "acorn": "bin/acorn" - }, - "engines": { - "node": ">=0.4.0" - } - }, - "node_modules/acorn-jsx": { - "version": "5.3.2", - "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz", - "integrity": "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==", - "dev": true, - "license": "MIT", - "peerDependencies": { - "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" - } - }, - "node_modules/ajv": { - "version": "6.12.6", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", - "integrity": 
"sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", - "dev": true, - "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/epoberezkin" - } - }, - "node_modules/ansi-regex": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", - "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/ansi-regex?sponsor=1" - } - }, - "node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "license": "MIT", - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/any-promise": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz", - "integrity": "sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==", - "license": "MIT" - }, - "node_modules/anymatch": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", - "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", - "license": "ISC", - "dependencies": { - "normalize-path": "^3.0.0", - "picomatch": "^2.0.4" - }, - "engines": { - "node": ">= 8" - } - }, - "node_modules/arg": { - "version": "5.0.2", - "resolved": "https://registry.npmjs.org/arg/-/arg-5.0.2.tgz", - "integrity": "sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==", - "license": "MIT" - }, - "node_modules/argparse": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", - "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", - "dev": true, - "license": "Python-2.0" - }, - "node_modules/autoprefixer": { - "version": "10.4.20", - "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz", - "integrity": "sha512-XY25y5xSv/wEoqzDyXXME4AFfkZI0P23z6Fs3YgymDnKJkCGOnkL0iTxCa85UTqaSgfcqyf3UA6+c7wUvx/16g==", - "funding": [ - { - "type": "opencollective", - "url": "https://opencollective.com/postcss/" - }, - { - "type": "tidelift", - "url": "https://tidelift.com/funding/github/npm/autoprefixer" - }, - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "dependencies": { - "browserslist": "^4.23.3", - "caniuse-lite": "^1.0.30001646", - "fraction.js": "^4.3.7", - "normalize-range": "^0.1.2", - "picocolors": "^1.0.1", - "postcss-value-parser": "^4.2.0" - }, - "bin": { - "autoprefixer": "bin/autoprefixer" - }, - "engines": { - "node": "^10 || ^12 || >=14" - }, - "peerDependencies": { - "postcss": "^8.1.0" - } - }, - "node_modules/bail": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz", - "integrity": "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==", - "license": "MIT", - 
"funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/balanced-match": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", - "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", - "license": "MIT" - }, - "node_modules/binary-extensions": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz", - "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", - "license": "MIT", - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "dev": true, - "license": "MIT", - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "node_modules/braces": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", - "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", - "license": "MIT", - "dependencies": { - "fill-range": "^7.1.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/browserslist": { - "version": "4.24.4", - "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.24.4.tgz", - "integrity": "sha512-KDi1Ny1gSePi1vm0q4oxSF8b4DR44GF4BbmS2YdhPLOEqd8pDviZOGH/GsmRwoWJ2+5Lr085X7naowMwKHDG1A==", - "funding": [ - { - "type": "opencollective", - "url": "https://opencollective.com/browserslist" - }, - { - "type": "tidelift", - "url": "https://tidelift.com/funding/github/npm/browserslist" - }, - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "dependencies": { - "caniuse-lite": "^1.0.30001688", - "electron-to-chromium": "^1.5.73", - "node-releases": "^2.0.19", - "update-browserslist-db": "^1.1.1" - }, - "bin": { - "browserslist": "cli.js" - }, - "engines": { - "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" - } - }, - "node_modules/buffer-builder": { - "version": "0.2.0", - "resolved": "https://registry.npmjs.org/buffer-builder/-/buffer-builder-0.2.0.tgz", - "integrity": "sha512-7VPMEPuYznPSoR21NE1zvd2Xna6c/CloiZCfcMXR1Jny6PjX0N4Nsa38zcBFo/FMK+BlA+FLKbJCQ0i2yxp+Xg==", - "devOptional": true, - "license": "MIT/X11" - }, - "node_modules/callsites": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", - "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6" - } - }, - "node_modules/camelcase-css": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/camelcase-css/-/camelcase-css-2.0.1.tgz", - "integrity": "sha512-QOSvevhslijgYwRx6Rv7zKdMF8lbRmx+uQGx2+vDc+KI/eBnsy9kit5aj23AgGu3pa4t9AgwbnXWqS+iOY+2aA==", - "license": "MIT", - "engines": { - "node": ">= 6" - } - }, - "node_modules/caniuse-lite": { - "version": "1.0.30001697", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001697.tgz", - "integrity": "sha512-GwNPlWJin8E+d7Gxq96jxM6w0w+VFeyyXRsjU58emtkYqnbwHqXm5uT2uCmO0RQE9htWknOP4xtBlLmM/gWxvQ==", - "funding": [ - { 
- "type": "opencollective", - "url": "https://opencollective.com/browserslist" - }, - { - "type": "tidelift", - "url": "https://tidelift.com/funding/github/npm/caniuse-lite" - }, - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "CC-BY-4.0" - }, - "node_modules/ccount": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz", - "integrity": "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "license": "MIT", - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, - "node_modules/character-entities": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/character-entities/-/character-entities-2.0.2.tgz", - "integrity": "sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/character-entities-html4": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/character-entities-html4/-/character-entities-html4-2.1.0.tgz", - "integrity": "sha512-1v7fgQRj6hnSwFpq1Eu0ynr/CDEw0rXo2B61qXrLNdHZmPKgb7fqS1a2JwF0rISo9q77jDI8VMEHoApn8qDoZA==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/character-entities-legacy": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/character-entities-legacy/-/character-entities-legacy-3.0.0.tgz", - "integrity": "sha512-RpPp0asT/6ufRm//AJVwpViZbGM/MkjQFxJccQRHmISF/22NBtsHqAWmL+/pmkPWoIUJdWyeVleTl1wydHATVQ==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/character-reference-invalid": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/character-reference-invalid/-/character-reference-invalid-2.0.1.tgz", - "integrity": "sha512-iBZ4F4wRbyORVsu0jPV7gXkOsGYjGHPmAyv+HiHG8gi5PtC9KI2j1+v8/tlibRvjoWX027ypmG/n0HtO5t7unw==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/chokidar": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz", - "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==", - "license": "MIT", - "dependencies": { - "anymatch": "~3.1.2", - "braces": "~3.0.2", - "glob-parent": "~5.1.2", - "is-binary-path": "~2.1.0", - "is-glob": "~4.0.1", - "normalize-path": "~3.0.0", - "readdirp": "~3.6.0" - }, - "engines": { - "node": ">= 8.10.0" - }, - "funding": { - "url": "https://paulmillr.com/funding/" - }, - "optionalDependencies": { - "fsevents": "~2.3.2" - } - }, - "node_modules/chokidar/node_modules/glob-parent": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", - "integrity": 
"sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", - "license": "ISC", - "dependencies": { - "is-glob": "^4.0.1" - }, - "engines": { - "node": ">= 6" - } - }, - "node_modules/color-convert": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", - "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", - "license": "MIT", - "dependencies": { - "color-name": "~1.1.4" - }, - "engines": { - "node": ">=7.0.0" - } - }, - "node_modules/color-name": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", - "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", - "license": "MIT" - }, - "node_modules/colorjs.io": { - "version": "0.5.2", - "resolved": "https://registry.npmjs.org/colorjs.io/-/colorjs.io-0.5.2.tgz", - "integrity": "sha512-twmVoizEW7ylZSN32OgKdXRmo1qg+wT5/6C3xu5b9QsWzSFAhHLn2xd8ro0diCsKfCj1RdaTP/nrcW+vAoQPIw==", - "devOptional": true, - "license": "MIT" - }, - "node_modules/comma-separated-tokens": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz", - "integrity": "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/commander": { - "version": "8.3.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz", - "integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==", - "license": "MIT", - "engines": { - "node": ">= 12" - } - }, - "node_modules/concat-map": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", - "dev": true, - "license": "MIT" - }, - "node_modules/convert-source-map": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", - "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==", - "dev": true, - "license": "MIT" - }, - "node_modules/cookie": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/cookie/-/cookie-1.0.2.tgz", - "integrity": "sha512-9Kr/j4O16ISv8zBBhJoi4bXOYNTkFLOqSL3UDB0njXxCXNezjeyVrJyGOWtgfs/q2km1gwBcfH8q1yEGoMYunA==", - "license": "MIT", - "engines": { - "node": ">=18" - } - }, - "node_modules/cross-spawn": { - "version": "7.0.6", - "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", - "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", - "license": "MIT", - "dependencies": { - "path-key": "^3.1.0", - "shebang-command": "^2.0.0", - "which": "^2.0.1" - }, - "engines": { - "node": ">= 8" - } - }, - "node_modules/css-selector-tokenizer": { - "version": "0.8.0", - "resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz", - "integrity": "sha512-Jd6Ig3/pe62/qe5SBPTN8h8LeUg/pT4lLgtavPf7updwwHpvFzxvOQBHYj2LZDMjUnBzgvIUSjRcf6oT5HzHFg==", - "license": "MIT", - "dependencies": { - "cssesc": "^3.0.0", - "fastparse": "^1.1.2" - } - }, - "node_modules/cssesc": { - "version": "3.0.0", - 
"resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", - "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==", - "license": "MIT", - "bin": { - "cssesc": "bin/cssesc" - }, - "engines": { - "node": ">=4" - } - }, - "node_modules/csstype": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz", - "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==", - "license": "MIT" - }, - "node_modules/culori": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/culori/-/culori-3.3.0.tgz", - "integrity": "sha512-pHJg+jbuFsCjz9iclQBqyL3B2HLCBF71BwVNujUYEvCeQMvV97R59MNK3R2+jgJ3a1fcZgI9B3vYgz8lzr/BFQ==", - "license": "MIT", - "engines": { - "node": "^12.20.0 || ^14.13.1 || >=16.0.0" - } - }, - "node_modules/daisyui": { - "version": "4.12.23", - "resolved": "https://registry.npmjs.org/daisyui/-/daisyui-4.12.23.tgz", - "integrity": "sha512-EM38duvxutJ5PD65lO/AFMpcw+9qEy6XAZrTpzp7WyaPeO/l+F/Qiq0ECHHmFNcFXh5aVoALY4MGrrxtCiaQCQ==", - "license": "MIT", - "dependencies": { - "css-selector-tokenizer": "^0.8", - "culori": "^3", - "picocolors": "^1", - "postcss-js": "^4" - }, - "engines": { - "node": ">=16.9.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/daisyui" - } - }, - "node_modules/debug": { - "version": "4.4.0", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.0.tgz", - "integrity": "sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA==", - "license": "MIT", - "dependencies": { - "ms": "^2.1.3" - }, - "engines": { - "node": ">=6.0" - }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } - } - }, - "node_modules/decode-named-character-reference": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/decode-named-character-reference/-/decode-named-character-reference-1.0.2.tgz", - "integrity": "sha512-O8x12RzrUF8xyVcY0KJowWsmaJxQbmy0/EtnNtHRpsOcT7dFk5W598coHqBVpmWo1oQQfsCqfCmkZN5DJrZVdg==", - "license": "MIT", - "dependencies": { - "character-entities": "^2.0.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/deep-is": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz", - "integrity": "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==", - "dev": true, - "license": "MIT" - }, - "node_modules/dequal": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", - "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", - "license": "MIT", - "engines": { - "node": ">=6" - } - }, - "node_modules/devlop": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/devlop/-/devlop-1.1.0.tgz", - "integrity": "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==", - "license": "MIT", - "dependencies": { - "dequal": "^2.0.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/didyoumean": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/didyoumean/-/didyoumean-1.2.2.tgz", - "integrity": "sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==", - "license": "Apache-2.0" - }, - "node_modules/dlv": { - "version": 
"1.1.3", - "resolved": "https://registry.npmjs.org/dlv/-/dlv-1.1.3.tgz", - "integrity": "sha512-+HlytyjlPKnIG8XuRG8WvmBP8xs8P71y+SKKS6ZXWoEgLuePxtDoUEiH7WkdePWrQ5JBpE6aoVqfZfJUQkjXwA==", - "license": "MIT" - }, - "node_modules/eastasianwidth": { - "version": "0.2.0", - "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", - "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==", - "license": "MIT" - }, - "node_modules/electron-to-chromium": { - "version": "1.5.91", - "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.91.tgz", - "integrity": "sha512-sNSHHyq048PFmZY4S90ax61q+gLCs0X0YmcOII9wG9S2XwbVr+h4VW2wWhnbp/Eys3cCwTxVF292W3qPaxIapQ==", - "license": "ISC" - }, - "node_modules/emoji-regex": { - "version": "9.2.2", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz", - "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==", - "license": "MIT" - }, - "node_modules/entities": { - "version": "4.5.0", - "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", - "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", - "license": "BSD-2-Clause", - "engines": { - "node": ">=0.12" - }, - "funding": { - "url": "https://github.com/fb55/entities?sponsor=1" - } - }, - "node_modules/esbuild": { - "version": "0.24.2", - "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.24.2.tgz", - "integrity": "sha512-+9egpBW8I3CD5XPe0n6BfT5fxLzxrlDzqydF3aviG+9ni1lDC/OvMHcxqEFV0+LANZG5R1bFMWfUrjVsdwxJvA==", - "hasInstallScript": true, - "license": "MIT", - "bin": { - "esbuild": "bin/esbuild" - }, - "engines": { - "node": ">=18" - }, - "optionalDependencies": { - "@esbuild/aix-ppc64": "0.24.2", - "@esbuild/android-arm": "0.24.2", - "@esbuild/android-arm64": "0.24.2", - "@esbuild/android-x64": "0.24.2", - "@esbuild/darwin-arm64": "0.24.2", - "@esbuild/darwin-x64": "0.24.2", - "@esbuild/freebsd-arm64": "0.24.2", - "@esbuild/freebsd-x64": "0.24.2", - "@esbuild/linux-arm": "0.24.2", - "@esbuild/linux-arm64": "0.24.2", - "@esbuild/linux-ia32": "0.24.2", - "@esbuild/linux-loong64": "0.24.2", - "@esbuild/linux-mips64el": "0.24.2", - "@esbuild/linux-ppc64": "0.24.2", - "@esbuild/linux-riscv64": "0.24.2", - "@esbuild/linux-s390x": "0.24.2", - "@esbuild/linux-x64": "0.24.2", - "@esbuild/netbsd-arm64": "0.24.2", - "@esbuild/netbsd-x64": "0.24.2", - "@esbuild/openbsd-arm64": "0.24.2", - "@esbuild/openbsd-x64": "0.24.2", - "@esbuild/sunos-x64": "0.24.2", - "@esbuild/win32-arm64": "0.24.2", - "@esbuild/win32-ia32": "0.24.2", - "@esbuild/win32-x64": "0.24.2" - } - }, - "node_modules/escalade": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", - "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", - "license": "MIT", - "engines": { - "node": ">=6" - } - }, - "node_modules/escape-string-regexp": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", - "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/eslint": { - "version": "9.19.0", - "resolved": 
"https://registry.npmjs.org/eslint/-/eslint-9.19.0.tgz", - "integrity": "sha512-ug92j0LepKlbbEv6hD911THhoRHmbdXt2gX+VDABAW/Ir7D3nqKdv5Pf5vtlyY6HQMTEP2skXY43ueqTCWssEA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@eslint-community/eslint-utils": "^4.2.0", - "@eslint-community/regexpp": "^4.12.1", - "@eslint/config-array": "^0.19.0", - "@eslint/core": "^0.10.0", - "@eslint/eslintrc": "^3.2.0", - "@eslint/js": "9.19.0", - "@eslint/plugin-kit": "^0.2.5", - "@humanfs/node": "^0.16.6", - "@humanwhocodes/module-importer": "^1.0.1", - "@humanwhocodes/retry": "^0.4.1", - "@types/estree": "^1.0.6", - "@types/json-schema": "^7.0.15", - "ajv": "^6.12.4", - "chalk": "^4.0.0", - "cross-spawn": "^7.0.6", - "debug": "^4.3.2", - "escape-string-regexp": "^4.0.0", - "eslint-scope": "^8.2.0", - "eslint-visitor-keys": "^4.2.0", - "espree": "^10.3.0", - "esquery": "^1.5.0", - "esutils": "^2.0.2", - "fast-deep-equal": "^3.1.3", - "file-entry-cache": "^8.0.0", - "find-up": "^5.0.0", - "glob-parent": "^6.0.2", - "ignore": "^5.2.0", - "imurmurhash": "^0.1.4", - "is-glob": "^4.0.0", - "json-stable-stringify-without-jsonify": "^1.0.1", - "lodash.merge": "^4.6.2", - "minimatch": "^3.1.2", - "natural-compare": "^1.4.0", - "optionator": "^0.9.3" - }, - "bin": { - "eslint": "bin/eslint.js" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "url": "https://eslint.org/donate" - }, - "peerDependencies": { - "jiti": "*" - }, - "peerDependenciesMeta": { - "jiti": { - "optional": true - } - } - }, - "node_modules/eslint-plugin-react-hooks": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/eslint-plugin-react-hooks/-/eslint-plugin-react-hooks-5.1.0.tgz", - "integrity": "sha512-mpJRtPgHN2tNAvZ35AMfqeB3Xqeo273QxrHJsbBEPWODRM4r0yB6jfoROqKEYrOn27UtRPpcpHc2UqyBSuUNTw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "peerDependencies": { - "eslint": "^3.0.0 || ^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0-0 || ^9.0.0" - } - }, - "node_modules/eslint-plugin-react-refresh": { - "version": "0.4.18", - "resolved": "https://registry.npmjs.org/eslint-plugin-react-refresh/-/eslint-plugin-react-refresh-0.4.18.tgz", - "integrity": "sha512-IRGEoFn3OKalm3hjfolEWGqoF/jPqeEYFp+C8B0WMzwGwBMvlRDQd06kghDhF0C61uJ6WfSDhEZE/sAQjduKgw==", - "dev": true, - "license": "MIT", - "peerDependencies": { - "eslint": ">=8.40" - } - }, - "node_modules/eslint-scope": { - "version": "8.2.0", - "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-8.2.0.tgz", - "integrity": "sha512-PHlWUfG6lvPc3yvP5A4PNyBL1W8fkDUccmI21JUu/+GKZBoH/W5u6usENXUrWFRsyoW5ACUjFGgAFQp5gUlb/A==", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "esrecurse": "^4.3.0", - "estraverse": "^5.2.0" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/eslint-visitor-keys": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.0.tgz", - "integrity": "sha512-UyLnSehNt62FFhSwjZlHmeokpRK59rcz29j+F1/aDgbkbRTk7wIc9XzdoasMUbRNKDM0qQt/+BJ4BrpFeABemw==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/espree": { - "version": "10.3.0", - "resolved": "https://registry.npmjs.org/espree/-/espree-10.3.0.tgz", - "integrity": 
"sha512-0QYC8b24HWY8zjRnDTL6RiHfDbAWn63qb4LMj1Z4b076A4une81+z03Kg7l7mn/48PUTqoLptSXez8oknU8Clg==", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "acorn": "^8.14.0", - "acorn-jsx": "^5.3.2", - "eslint-visitor-keys": "^4.2.0" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/esquery": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/esquery/-/esquery-1.6.0.tgz", - "integrity": "sha512-ca9pw9fomFcKPvFLXhBKUK90ZvGibiGOvRJNbjljY7s7uq/5YO4BOzcYtJqExdx99rF6aAcnRxHmcUHcz6sQsg==", - "dev": true, - "license": "BSD-3-Clause", - "dependencies": { - "estraverse": "^5.1.0" - }, - "engines": { - "node": ">=0.10" - } - }, - "node_modules/esrecurse": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz", - "integrity": "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "estraverse": "^5.2.0" - }, - "engines": { - "node": ">=4.0" - } - }, - "node_modules/estraverse": { - "version": "5.3.0", - "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", - "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", - "dev": true, - "license": "BSD-2-Clause", - "engines": { - "node": ">=4.0" - } - }, - "node_modules/estree-util-is-identifier-name": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/estree-util-is-identifier-name/-/estree-util-is-identifier-name-3.0.0.tgz", - "integrity": "sha512-hFtqIDZTIUZ9BXLb8y4pYGyk6+wekIivNVTcmvk8NoOh+VeRn5y6cEHzbURrWbfp1fIqdVipilzj+lfaadNZmg==", - "license": "MIT", - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/esutils": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", - "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==", - "dev": true, - "license": "BSD-2-Clause", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/extend": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", - "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", - "license": "MIT" - }, - "node_modules/fast-deep-equal": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", - "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==", - "dev": true, - "license": "MIT" - }, - "node_modules/fast-glob": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz", - "integrity": "sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==", - "license": "MIT", - "dependencies": { - "@nodelib/fs.stat": "^2.0.2", - "@nodelib/fs.walk": "^1.2.3", - "glob-parent": "^5.1.2", - "merge2": "^1.3.0", - "micromatch": "^4.0.8" - }, - "engines": { - "node": ">=8.6.0" - } - }, - "node_modules/fast-glob/node_modules/glob-parent": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", - "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", - "license": "ISC", - "dependencies": { - 
"is-glob": "^4.0.1" - }, - "engines": { - "node": ">= 6" - } - }, - "node_modules/fast-json-stable-stringify": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", - "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==", - "dev": true, - "license": "MIT" - }, - "node_modules/fast-levenshtein": { - "version": "2.0.6", - "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", - "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", - "dev": true, - "license": "MIT" - }, - "node_modules/fastparse": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/fastparse/-/fastparse-1.1.2.tgz", - "integrity": "sha512-483XLLxTVIwWK3QTrMGRqUfUpoOs/0hbQrl2oz4J0pAcm3A3bu84wxTFqGqkJzewCLdME38xJLJAxBABfQT8sQ==", - "license": "MIT" - }, - "node_modules/fastq": { - "version": "1.19.0", - "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.19.0.tgz", - "integrity": "sha512-7SFSRCNjBQIZH/xZR3iy5iQYR8aGBE0h3VG6/cwlbrpdciNYBMotQav8c1XI3HjHH+NikUpP53nPdlZSdWmFzA==", - "license": "ISC", - "dependencies": { - "reusify": "^1.0.4" - } - }, - "node_modules/file-entry-cache": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz", - "integrity": "sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "flat-cache": "^4.0.0" - }, - "engines": { - "node": ">=16.0.0" - } - }, - "node_modules/fill-range": { - "version": "7.1.1", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", - "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", - "license": "MIT", - "dependencies": { - "to-regex-range": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/find-up": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", - "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", - "dev": true, - "license": "MIT", - "dependencies": { - "locate-path": "^6.0.0", - "path-exists": "^4.0.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/flat-cache": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-4.0.1.tgz", - "integrity": "sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==", - "dev": true, - "license": "MIT", - "dependencies": { - "flatted": "^3.2.9", - "keyv": "^4.5.4" - }, - "engines": { - "node": ">=16" - } - }, - "node_modules/flatted": { - "version": "3.3.2", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.2.tgz", - "integrity": "sha512-AiwGJM8YcNOaobumgtng+6NHuOqC3A7MixFeDafM3X9cIUM+xUXoS5Vfgf+OihAYe20fxqNM9yPBXJzRtZ/4eA==", - "dev": true, - "license": "ISC" - }, - "node_modules/foreground-child": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.0.tgz", - "integrity": "sha512-Ld2g8rrAyMYFXBhEqMz8ZAHBi4J4uS1i/CxGMDnjyFWddMXLVcDp051DZfu+t7+ab7Wv6SMqpWmyFIj5UbfFvg==", - "license": "ISC", - "dependencies": { - "cross-spawn": "^7.0.0", - "signal-exit": "^4.0.1" - }, - "engines": { - "node": ">=14" - }, - 
"funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/fraction.js": { - "version": "4.3.7", - "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.7.tgz", - "integrity": "sha512-ZsDfxO51wGAXREY55a7la9LScWpwv9RxIrYABrlvOFBlH/ShPnrtsXeuUIfXKKOVicNxQ+o8JTbJvjS4M89yew==", - "license": "MIT", - "engines": { - "node": "*" - }, - "funding": { - "type": "patreon", - "url": "https://github.com/sponsors/rawify" - } - }, - "node_modules/fsevents": { - "version": "2.3.3", - "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", - "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", - "hasInstallScript": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": "^8.16.0 || ^10.6.0 || >=11.0.0" - } - }, - "node_modules/function-bind": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", - "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", - "license": "MIT", - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/gensync": { - "version": "1.0.0-beta.2", - "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", - "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/glob": { - "version": "10.4.5", - "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz", - "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==", - "license": "ISC", - "dependencies": { - "foreground-child": "^3.1.0", - "jackspeak": "^3.1.2", - "minimatch": "^9.0.4", - "minipass": "^7.1.2", - "package-json-from-dist": "^1.0.0", - "path-scurry": "^1.11.1" - }, - "bin": { - "glob": "dist/esm/bin.mjs" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/glob-parent": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", - "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", - "license": "ISC", - "dependencies": { - "is-glob": "^4.0.3" - }, - "engines": { - "node": ">=10.13.0" - } - }, - "node_modules/glob/node_modules/brace-expansion": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", - "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", - "license": "MIT", - "dependencies": { - "balanced-match": "^1.0.0" - } - }, - "node_modules/glob/node_modules/minimatch": { - "version": "9.0.5", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", - "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", - "license": "ISC", - "dependencies": { - "brace-expansion": "^2.0.1" - }, - "engines": { - "node": ">=16 || 14 >=14.17" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/globals": { - "version": "15.14.0", - "resolved": "https://registry.npmjs.org/globals/-/globals-15.14.0.tgz", - "integrity": "sha512-OkToC372DtlQeje9/zHIo5CT8lRP/FUgEOKBEhU4e0abL7J7CD24fD9ohiLN5hagG/kWCYj4K5oaxxtj2Z0Dig==", - "dev": true, - "license": "MIT", - "engines": 
{ - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/graphemer": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz", - "integrity": "sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==", - "dev": true, - "license": "MIT" - }, - "node_modules/has-flag": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", - "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", - "devOptional": true, - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/hasown": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", - "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", - "license": "MIT", - "dependencies": { - "function-bind": "^1.1.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/hast-util-from-dom": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/hast-util-from-dom/-/hast-util-from-dom-5.0.1.tgz", - "integrity": "sha512-N+LqofjR2zuzTjCPzyDUdSshy4Ma6li7p/c3pA78uTwzFgENbgbUrm2ugwsOdcjI1muO+o6Dgzp9p8WHtn/39Q==", - "license": "ISC", - "dependencies": { - "@types/hast": "^3.0.0", - "hastscript": "^9.0.0", - "web-namespaces": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/hast-util-from-html": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/hast-util-from-html/-/hast-util-from-html-2.0.3.tgz", - "integrity": "sha512-CUSRHXyKjzHov8yKsQjGOElXy/3EKpyX56ELnkHH34vDVw1N1XSQ1ZcAvTyAPtGqLTuKP/uxM+aLkSPqF/EtMw==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "devlop": "^1.1.0", - "hast-util-from-parse5": "^8.0.0", - "parse5": "^7.0.0", - "vfile": "^6.0.0", - "vfile-message": "^4.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/hast-util-from-html-isomorphic": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/hast-util-from-html-isomorphic/-/hast-util-from-html-isomorphic-2.0.0.tgz", - "integrity": "sha512-zJfpXq44yff2hmE0XmwEOzdWin5xwH+QIhMLOScpX91e/NSGPsAzNCvLQDIEPyO2TXi+lBmU6hjLIhV8MwP2kw==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "hast-util-from-dom": "^5.0.0", - "hast-util-from-html": "^2.0.0", - "unist-util-remove-position": "^5.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/hast-util-from-parse5": { - "version": "8.0.2", - "resolved": "https://registry.npmjs.org/hast-util-from-parse5/-/hast-util-from-parse5-8.0.2.tgz", - "integrity": "sha512-SfMzfdAi/zAoZ1KkFEyyeXBn7u/ShQrfd675ZEE9M3qj+PMFX05xubzRyF76CCSJu8au9jgVxDV1+okFvgZU4A==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "@types/unist": "^3.0.0", - "devlop": "^1.0.0", - "hastscript": "^9.0.0", - "property-information": "^6.0.0", - "vfile": "^6.0.0", - "vfile-location": "^5.0.0", - "web-namespaces": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/hast-util-is-element": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/hast-util-is-element/-/hast-util-is-element-3.0.0.tgz", - "integrity": 
"sha512-Val9mnv2IWpLbNPqc/pUem+a7Ipj2aHacCwgNfTiK0vJKl0LF+4Ba4+v1oPHFpf3bLYmreq0/l3Gud9S5OH42g==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/hast-util-parse-selector": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz", - "integrity": "sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/hast-util-to-jsx-runtime": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/hast-util-to-jsx-runtime/-/hast-util-to-jsx-runtime-2.3.2.tgz", - "integrity": "sha512-1ngXYb+V9UT5h+PxNRa1O1FYguZK/XL+gkeqvp7EdHlB9oHUG0eYRo/vY5inBdcqo3RkPMC58/H94HvkbfGdyg==", - "license": "MIT", - "dependencies": { - "@types/estree": "^1.0.0", - "@types/hast": "^3.0.0", - "@types/unist": "^3.0.0", - "comma-separated-tokens": "^2.0.0", - "devlop": "^1.0.0", - "estree-util-is-identifier-name": "^3.0.0", - "hast-util-whitespace": "^3.0.0", - "mdast-util-mdx-expression": "^2.0.0", - "mdast-util-mdx-jsx": "^3.0.0", - "mdast-util-mdxjs-esm": "^2.0.0", - "property-information": "^6.0.0", - "space-separated-tokens": "^2.0.0", - "style-to-object": "^1.0.0", - "unist-util-position": "^5.0.0", - "vfile-message": "^4.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/hast-util-to-text": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/hast-util-to-text/-/hast-util-to-text-4.0.2.tgz", - "integrity": "sha512-KK6y/BN8lbaq654j7JgBydev7wuNMcID54lkRav1P0CaE1e47P72AWWPiGKXTJU271ooYzcvTAn/Zt0REnvc7A==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "@types/unist": "^3.0.0", - "hast-util-is-element": "^3.0.0", - "unist-util-find-after": "^5.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/hast-util-whitespace": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz", - "integrity": "sha512-88JUN06ipLwsnv+dVn+OIYOvAuvBMy/Qoi6O7mQHxdPXpjy+Cd6xRkWwux7DKO+4sYILtLBRIKgsdpS2gQc7qw==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/hastscript": { - "version": "9.0.0", - "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-9.0.0.tgz", - "integrity": "sha512-jzaLBGavEDKHrc5EfFImKN7nZKKBdSLIdGvCwDZ9TfzbF2ffXiov8CKE445L2Z1Ek2t/m4SKQ2j6Ipv7NyUolw==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "comma-separated-tokens": "^2.0.0", - "hast-util-parse-selector": "^4.0.0", - "property-information": "^6.0.0", - "space-separated-tokens": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/highlight.js": { - "version": "11.11.1", - "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.11.1.tgz", - "integrity": "sha512-Xwwo44whKBVCYoliBQwaPvtd/2tYFkRQtXDWj1nackaV2JPXx3L0+Jvd8/qCJ2p+ML0/XVkJ2q+Mr+UVdpJK5w==", - "license": "BSD-3-Clause", - "engines": { - "node": ">=12.0.0" - } - }, - 
"node_modules/html-url-attributes": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz", - "integrity": "sha512-ol6UPyBWqsrO6EJySPz2O7ZSr856WDrEzM5zMqp+FJJLGMW35cLYmmZnl0vztAZxRUoNZJFTCohfjuIJ8I4QBQ==", - "license": "MIT", - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/ignore": { - "version": "5.3.2", - "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", - "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 4" - } - }, - "node_modules/immutable": { - "version": "5.0.3", - "resolved": "https://registry.npmjs.org/immutable/-/immutable-5.0.3.tgz", - "integrity": "sha512-P8IdPQHq3lA1xVeBRi5VPqUm5HDgKnx0Ru51wZz5mjxHr5n3RWhjIpOFU7ybkUxfB+5IToy+OLaHYDBIWsv+uw==", - "devOptional": true, - "license": "MIT" - }, - "node_modules/import-fresh": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", - "integrity": "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "parent-module": "^1.0.0", - "resolve-from": "^4.0.0" - }, - "engines": { - "node": ">=6" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/imurmurhash": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", - "integrity": "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.8.19" - } - }, - "node_modules/inline-style-parser": { - "version": "0.2.4", - "resolved": "https://registry.npmjs.org/inline-style-parser/-/inline-style-parser-0.2.4.tgz", - "integrity": "sha512-0aO8FkhNZlj/ZIbNi7Lxxr12obT7cL1moPfE4tg1LkX7LlLfC6DeX4l2ZEud1ukP9jNQyNnfzQVqwbwmAATY4Q==", - "license": "MIT" - }, - "node_modules/is-alphabetical": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-alphabetical/-/is-alphabetical-2.0.1.tgz", - "integrity": "sha512-FWyyY60MeTNyeSRpkM2Iry0G9hpr7/9kD40mD/cGQEuilcZYS4okz8SN2Q6rLCJ8gbCt6fN+rC+6tMGS99LaxQ==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/is-alphanumerical": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-alphanumerical/-/is-alphanumerical-2.0.1.tgz", - "integrity": "sha512-hmbYhX/9MUMF5uh7tOXyK/n0ZvWpad5caBA17GsC6vyuCqaWliRG5K1qS9inmUhEMaOBIW7/whAnSwveW/LtZw==", - "license": "MIT", - "dependencies": { - "is-alphabetical": "^2.0.0", - "is-decimal": "^2.0.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/is-binary-path": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", - "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", - "license": "MIT", - "dependencies": { - "binary-extensions": "^2.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/is-core-module": { - "version": "2.16.1", - "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz", - "integrity": "sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==", - 
"license": "MIT", - "dependencies": { - "hasown": "^2.0.2" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-decimal": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-decimal/-/is-decimal-2.0.1.tgz", - "integrity": "sha512-AAB9hiomQs5DXWcRB1rqsxGUstbRroFOPPVAomNk/3XHR5JyEZChOyTWe2oayKnsSsr/kcGqF+z6yuH6HHpN0A==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/is-fullwidth-code-point": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", - "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/is-glob": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", - "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", - "license": "MIT", - "dependencies": { - "is-extglob": "^2.1.1" - }, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/is-hexadecimal": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-hexadecimal/-/is-hexadecimal-2.0.1.tgz", - "integrity": "sha512-DgZQp241c8oO6cA1SbTEWiXeoxV42vlcJxgH+B3hi1AiqqKruZR3ZGF8In3fj4+/y/7rHvlOZLZtgJ/4ttYGZg==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/is-number": { - "version": "7.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", - "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", - "license": "MIT", - "engines": { - "node": ">=0.12.0" - } - }, - "node_modules/is-plain-obj": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz", - "integrity": "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==", - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/isexe": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", - "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", - "license": "ISC" - }, - "node_modules/jackspeak": { - "version": "3.4.3", - "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-3.4.3.tgz", - "integrity": "sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==", - "license": "BlueOak-1.0.0", - "dependencies": { - "@isaacs/cliui": "^8.0.2" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - }, - "optionalDependencies": { - "@pkgjs/parseargs": "^0.11.0" - } - }, - "node_modules/jiti": { - "version": "1.21.7", - "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.7.tgz", - "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==", - "license": "MIT", 
- "bin": { - "jiti": "bin/jiti.js" - } - }, - "node_modules/js-tokens": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", - "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", - "license": "MIT" - }, - "node_modules/js-yaml": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", - "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", - "dev": true, - "license": "MIT", - "dependencies": { - "argparse": "^2.0.1" - }, - "bin": { - "js-yaml": "bin/js-yaml.js" - } - }, - "node_modules/jsesc": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz", - "integrity": "sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==", - "dev": true, - "license": "MIT", - "bin": { - "jsesc": "bin/jsesc" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/json-buffer": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz", - "integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==", - "dev": true, - "license": "MIT" - }, - "node_modules/json-schema-traverse": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", - "dev": true, - "license": "MIT" - }, - "node_modules/json-stable-stringify-without-jsonify": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz", - "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==", - "dev": true, - "license": "MIT" - }, - "node_modules/json5": { - "version": "2.2.3", - "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz", - "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==", - "dev": true, - "license": "MIT", - "bin": { - "json5": "lib/cli.js" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/katex": { - "version": "0.16.21", - "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.21.tgz", - "integrity": "sha512-XvqR7FgOHtWupfMiigNzmh+MgUVmDGU2kXZm899ZkPfcuoPuFxyHmXsgATDpFZDAXCI8tvinaVcDo8PIIJSo4A==", - "funding": [ - "https://opencollective.com/katex", - "https://github.com/sponsors/katex" - ], - "license": "MIT", - "dependencies": { - "commander": "^8.3.0" - }, - "bin": { - "katex": "cli.js" - } - }, - "node_modules/keyv": { - "version": "4.5.4", - "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", - "integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==", - "dev": true, - "license": "MIT", - "dependencies": { - "json-buffer": "3.0.1" - } - }, - "node_modules/levn": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", - "integrity": "sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "prelude-ls": "^1.2.1", - "type-check": "~0.4.0" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/lilconfig": { - "version": "3.1.3", - "resolved": 
"https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz", - "integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==", - "license": "MIT", - "engines": { - "node": ">=14" - }, - "funding": { - "url": "https://github.com/sponsors/antonk52" - } - }, - "node_modules/lines-and-columns": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", - "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", - "license": "MIT" - }, - "node_modules/locate-path": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", - "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==", - "dev": true, - "license": "MIT", - "dependencies": { - "p-locate": "^5.0.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/lodash.merge": { - "version": "4.6.2", - "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", - "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==", - "dev": true, - "license": "MIT" - }, - "node_modules/longest-streak": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz", - "integrity": "sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/loose-envify": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", - "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", - "license": "MIT", - "dependencies": { - "js-tokens": "^3.0.0 || ^4.0.0" - }, - "bin": { - "loose-envify": "cli.js" - } - }, - "node_modules/lowlight": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-3.3.0.tgz", - "integrity": "sha512-0JNhgFoPvP6U6lE/UdVsSq99tn6DhjjpAj5MxG49ewd2mOBVtwWYIT8ClyABhq198aXXODMU6Ox8DrGy/CpTZQ==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "devlop": "^1.0.0", - "highlight.js": "~11.11.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/lru-cache": { - "version": "5.1.1", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", - "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", - "dev": true, - "license": "ISC", - "dependencies": { - "yallist": "^3.0.2" - } - }, - "node_modules/markdown-table": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.4.tgz", - "integrity": "sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/mdast-util-find-and-replace": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/mdast-util-find-and-replace/-/mdast-util-find-and-replace-3.0.2.tgz", - "integrity": "sha512-Tmd1Vg/m3Xz43afeNxDIhWRtFZgM2VLyaf4vSTYwudTyeuTneoL3qtWMA5jeLyz/O1vDJmmV4QuScFCA2tBPwg==", - "license": "MIT", - "dependencies": 
{ - "@types/mdast": "^4.0.0", - "escape-string-regexp": "^5.0.0", - "unist-util-is": "^6.0.0", - "unist-util-visit-parents": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-find-and-replace/node_modules/escape-string-regexp": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz", - "integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==", - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/mdast-util-from-markdown": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/mdast-util-from-markdown/-/mdast-util-from-markdown-2.0.2.tgz", - "integrity": "sha512-uZhTV/8NBuw0WHkPTrCqDOl0zVe1BIng5ZtHoDk49ME1qqcjYmmLmOf0gELgcRMxN4w2iuIeVso5/6QymSrgmA==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "@types/unist": "^3.0.0", - "decode-named-character-reference": "^1.0.0", - "devlop": "^1.0.0", - "mdast-util-to-string": "^4.0.0", - "micromark": "^4.0.0", - "micromark-util-decode-numeric-character-reference": "^2.0.0", - "micromark-util-decode-string": "^2.0.0", - "micromark-util-normalize-identifier": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0", - "unist-util-stringify-position": "^4.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-gfm": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.0.0.tgz", - "integrity": "sha512-dgQEX5Amaq+DuUqf26jJqSK9qgixgd6rYDHAv4aTBuA92cTknZlKpPfa86Z/s8Dj8xsAQpFfBmPUHWJBWqS4Bw==", - "license": "MIT", - "dependencies": { - "mdast-util-from-markdown": "^2.0.0", - "mdast-util-gfm-autolink-literal": "^2.0.0", - "mdast-util-gfm-footnote": "^2.0.0", - "mdast-util-gfm-strikethrough": "^2.0.0", - "mdast-util-gfm-table": "^2.0.0", - "mdast-util-gfm-task-list-item": "^2.0.0", - "mdast-util-to-markdown": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-gfm-autolink-literal": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/mdast-util-gfm-autolink-literal/-/mdast-util-gfm-autolink-literal-2.0.1.tgz", - "integrity": "sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "ccount": "^2.0.0", - "devlop": "^1.0.0", - "mdast-util-find-and-replace": "^3.0.0", - "micromark-util-character": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-gfm-footnote": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/mdast-util-gfm-footnote/-/mdast-util-gfm-footnote-2.0.0.tgz", - "integrity": "sha512-5jOT2boTSVkMnQ7LTrd6n/18kqwjmuYqo7JUPe+tRCY6O7dAuTFMtTPauYYrMPpox9hlN0uOx/FL8XvEfG9/mQ==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "devlop": "^1.1.0", - "mdast-util-from-markdown": "^2.0.0", - "mdast-util-to-markdown": "^2.0.0", - "micromark-util-normalize-identifier": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-gfm-strikethrough": { - "version": "2.0.0", - 
"resolved": "https://registry.npmjs.org/mdast-util-gfm-strikethrough/-/mdast-util-gfm-strikethrough-2.0.0.tgz", - "integrity": "sha512-mKKb915TF+OC5ptj5bJ7WFRPdYtuHv0yTRxK2tJvi+BDqbkiG7h7u/9SI89nRAYcmap2xHQL9D+QG/6wSrTtXg==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "mdast-util-from-markdown": "^2.0.0", - "mdast-util-to-markdown": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-gfm-table": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/mdast-util-gfm-table/-/mdast-util-gfm-table-2.0.0.tgz", - "integrity": "sha512-78UEvebzz/rJIxLvE7ZtDd/vIQ0RHv+3Mh5DR96p7cS7HsBhYIICDBCu8csTNWNO6tBWfqXPWekRuj2FNOGOZg==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "devlop": "^1.0.0", - "markdown-table": "^3.0.0", - "mdast-util-from-markdown": "^2.0.0", - "mdast-util-to-markdown": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-gfm-task-list-item": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/mdast-util-gfm-task-list-item/-/mdast-util-gfm-task-list-item-2.0.0.tgz", - "integrity": "sha512-IrtvNvjxC1o06taBAVJznEnkiHxLFTzgonUdy8hzFVeDun0uTjxxrRGVaNFqkU1wJR3RBPEfsxmU6jDWPofrTQ==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "devlop": "^1.0.0", - "mdast-util-from-markdown": "^2.0.0", - "mdast-util-to-markdown": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-math": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/mdast-util-math/-/mdast-util-math-3.0.0.tgz", - "integrity": "sha512-Tl9GBNeG/AhJnQM221bJR2HPvLOSnLE/T9cJI9tlc6zwQk2nPk/4f0cHkOdEixQPC/j8UtKDdITswvLAy1OZ1w==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "@types/mdast": "^4.0.0", - "devlop": "^1.0.0", - "longest-streak": "^3.0.0", - "mdast-util-from-markdown": "^2.0.0", - "mdast-util-to-markdown": "^2.1.0", - "unist-util-remove-position": "^5.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-mdx-expression": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/mdast-util-mdx-expression/-/mdast-util-mdx-expression-2.0.1.tgz", - "integrity": "sha512-J6f+9hUp+ldTZqKRSg7Vw5V6MqjATc+3E4gf3CFNcuZNWD8XdyI6zQ8GqH7f8169MM6P7hMBRDVGnn7oHB9kXQ==", - "license": "MIT", - "dependencies": { - "@types/estree-jsx": "^1.0.0", - "@types/hast": "^3.0.0", - "@types/mdast": "^4.0.0", - "devlop": "^1.0.0", - "mdast-util-from-markdown": "^2.0.0", - "mdast-util-to-markdown": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-mdx-jsx": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/mdast-util-mdx-jsx/-/mdast-util-mdx-jsx-3.2.0.tgz", - "integrity": "sha512-lj/z8v0r6ZtsN/cGNNtemmmfoLAFZnjMbNyLzBafjzikOM+glrjNHPlf6lQDOTccj9n5b0PPihEBbhneMyGs1Q==", - "license": "MIT", - "dependencies": { - "@types/estree-jsx": "^1.0.0", - "@types/hast": "^3.0.0", - "@types/mdast": "^4.0.0", - "@types/unist": "^3.0.0", - "ccount": "^2.0.0", - "devlop": "^1.1.0", - "mdast-util-from-markdown": "^2.0.0", - "mdast-util-to-markdown": "^2.0.0", - "parse-entities": "^4.0.0", - "stringify-entities": "^4.0.0", - "unist-util-stringify-position": "^4.0.0", - "vfile-message": "^4.0.0" 
- }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-mdxjs-esm": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/mdast-util-mdxjs-esm/-/mdast-util-mdxjs-esm-2.0.1.tgz", - "integrity": "sha512-EcmOpxsZ96CvlP03NghtH1EsLtr0n9Tm4lPUJUBccV9RwUOneqSycg19n5HGzCf+10LozMRSObtVr3ee1WoHtg==", - "license": "MIT", - "dependencies": { - "@types/estree-jsx": "^1.0.0", - "@types/hast": "^3.0.0", - "@types/mdast": "^4.0.0", - "devlop": "^1.0.0", - "mdast-util-from-markdown": "^2.0.0", - "mdast-util-to-markdown": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-newline-to-break": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/mdast-util-newline-to-break/-/mdast-util-newline-to-break-2.0.0.tgz", - "integrity": "sha512-MbgeFca0hLYIEx/2zGsszCSEJJ1JSCdiY5xQxRcLDDGa8EPvlLPupJ4DSajbMPAnC0je8jfb9TiUATnxxrHUog==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "mdast-util-find-and-replace": "^3.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-phrasing": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/mdast-util-phrasing/-/mdast-util-phrasing-4.1.0.tgz", - "integrity": "sha512-TqICwyvJJpBwvGAMZjj4J2n0X8QWp21b9l0o7eXyVJ25YNWYbJDVIyD1bZXE6WtV6RmKJVYmQAKWa0zWOABz2w==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "unist-util-is": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-to-hast": { - "version": "13.2.0", - "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-13.2.0.tgz", - "integrity": "sha512-QGYKEuUsYT9ykKBCMOEDLsU5JRObWQusAolFMeko/tYPufNkRffBAQjIE+99jbA87xv6FgmjLtwjh9wBWajwAA==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "@types/mdast": "^4.0.0", - "@ungap/structured-clone": "^1.0.0", - "devlop": "^1.0.0", - "micromark-util-sanitize-uri": "^2.0.0", - "trim-lines": "^3.0.0", - "unist-util-position": "^5.0.0", - "unist-util-visit": "^5.0.0", - "vfile": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-to-markdown": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/mdast-util-to-markdown/-/mdast-util-to-markdown-2.1.2.tgz", - "integrity": "sha512-xj68wMTvGXVOKonmog6LwyJKrYXZPvlwabaryTjLh9LuvovB/KAH+kvi8Gjj+7rJjsFi23nkUxRQv1KqSroMqA==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "@types/unist": "^3.0.0", - "longest-streak": "^3.0.0", - "mdast-util-phrasing": "^4.0.0", - "mdast-util-to-string": "^4.0.0", - "micromark-util-classify-character": "^2.0.0", - "micromark-util-decode-string": "^2.0.0", - "unist-util-visit": "^5.0.0", - "zwitch": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/mdast-util-to-string": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-4.0.0.tgz", - "integrity": "sha512-0H44vDimn51F0YwvxSJSm0eCDOJTRlmN0R1yBh4HLj9wiV1Dn0QoXGbvFAWj2hSItVTlCmBF1hqKlIyUBVFLPg==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - 
"node_modules/merge2": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", - "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", - "license": "MIT", - "engines": { - "node": ">= 8" - } - }, - "node_modules/micromark": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/micromark/-/micromark-4.0.1.tgz", - "integrity": "sha512-eBPdkcoCNvYcxQOAKAlceo5SNdzZWfF+FcSupREAzdAh9rRmE239CEQAiTwIgblwnoM8zzj35sZ5ZwvSEOF6Kw==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "@types/debug": "^4.0.0", - "debug": "^4.0.0", - "decode-named-character-reference": "^1.0.0", - "devlop": "^1.0.0", - "micromark-core-commonmark": "^2.0.0", - "micromark-factory-space": "^2.0.0", - "micromark-util-character": "^2.0.0", - "micromark-util-chunked": "^2.0.0", - "micromark-util-combine-extensions": "^2.0.0", - "micromark-util-decode-numeric-character-reference": "^2.0.0", - "micromark-util-encode": "^2.0.0", - "micromark-util-normalize-identifier": "^2.0.0", - "micromark-util-resolve-all": "^2.0.0", - "micromark-util-sanitize-uri": "^2.0.0", - "micromark-util-subtokenize": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-core-commonmark": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/micromark-core-commonmark/-/micromark-core-commonmark-2.0.2.tgz", - "integrity": "sha512-FKjQKbxd1cibWMM1P9N+H8TwlgGgSkWZMmfuVucLCHaYqeSvJ0hFeHsIa65pA2nYbes0f8LDHPMrd9X7Ujxg9w==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "decode-named-character-reference": "^1.0.0", - "devlop": "^1.0.0", - "micromark-factory-destination": "^2.0.0", - "micromark-factory-label": "^2.0.0", - "micromark-factory-space": "^2.0.0", - "micromark-factory-title": "^2.0.0", - "micromark-factory-whitespace": "^2.0.0", - "micromark-util-character": "^2.0.0", - "micromark-util-chunked": "^2.0.0", - "micromark-util-classify-character": "^2.0.0", - "micromark-util-html-tag-name": "^2.0.0", - "micromark-util-normalize-identifier": "^2.0.0", - "micromark-util-resolve-all": "^2.0.0", - "micromark-util-subtokenize": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-extension-gfm": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/micromark-extension-gfm/-/micromark-extension-gfm-3.0.0.tgz", - "integrity": "sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w==", - "license": "MIT", - "dependencies": { - "micromark-extension-gfm-autolink-literal": "^2.0.0", - "micromark-extension-gfm-footnote": "^2.0.0", - "micromark-extension-gfm-strikethrough": "^2.0.0", - "micromark-extension-gfm-table": "^2.0.0", - "micromark-extension-gfm-tagfilter": "^2.0.0", - "micromark-extension-gfm-task-list-item": "^2.0.0", - "micromark-util-combine-extensions": "^2.0.0", - "micromark-util-types": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/micromark-extension-gfm-autolink-literal": { - "version": "2.1.0", - "resolved": 
"https://registry.npmjs.org/micromark-extension-gfm-autolink-literal/-/micromark-extension-gfm-autolink-literal-2.1.0.tgz", - "integrity": "sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw==", - "license": "MIT", - "dependencies": { - "micromark-util-character": "^2.0.0", - "micromark-util-sanitize-uri": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/micromark-extension-gfm-footnote": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/micromark-extension-gfm-footnote/-/micromark-extension-gfm-footnote-2.1.0.tgz", - "integrity": "sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw==", - "license": "MIT", - "dependencies": { - "devlop": "^1.0.0", - "micromark-core-commonmark": "^2.0.0", - "micromark-factory-space": "^2.0.0", - "micromark-util-character": "^2.0.0", - "micromark-util-normalize-identifier": "^2.0.0", - "micromark-util-sanitize-uri": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/micromark-extension-gfm-strikethrough": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/micromark-extension-gfm-strikethrough/-/micromark-extension-gfm-strikethrough-2.1.0.tgz", - "integrity": "sha512-ADVjpOOkjz1hhkZLlBiYA9cR2Anf8F4HqZUO6e5eDcPQd0Txw5fxLzzxnEkSkfnD0wziSGiv7sYhk/ktvbf1uw==", - "license": "MIT", - "dependencies": { - "devlop": "^1.0.0", - "micromark-util-chunked": "^2.0.0", - "micromark-util-classify-character": "^2.0.0", - "micromark-util-resolve-all": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/micromark-extension-gfm-table": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/micromark-extension-gfm-table/-/micromark-extension-gfm-table-2.1.1.tgz", - "integrity": "sha512-t2OU/dXXioARrC6yWfJ4hqB7rct14e8f7m0cbI5hUmDyyIlwv5vEtooptH8INkbLzOatzKuVbQmAYcbWoyz6Dg==", - "license": "MIT", - "dependencies": { - "devlop": "^1.0.0", - "micromark-factory-space": "^2.0.0", - "micromark-util-character": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/micromark-extension-gfm-tagfilter": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/micromark-extension-gfm-tagfilter/-/micromark-extension-gfm-tagfilter-2.0.0.tgz", - "integrity": "sha512-xHlTOmuCSotIA8TW1mDIM6X2O1SiX5P9IuDtqGonFhEK0qgRI4yeC6vMxEV2dgyr2TiD+2PQ10o+cOhdVAcwfg==", - "license": "MIT", - "dependencies": { - "micromark-util-types": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/micromark-extension-gfm-task-list-item": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/micromark-extension-gfm-task-list-item/-/micromark-extension-gfm-task-list-item-2.1.0.tgz", - "integrity": "sha512-qIBZhqxqI6fjLDYFTBIa4eivDMnP+OZqsNwmQ3xNLE4Cxwc+zfQEfbs6tzAo2Hjq+bh6q5F+Z8/cksrLFYWQQw==", - "license": "MIT", - "dependencies": { - "devlop": "^1.0.0", - "micromark-factory-space": "^2.0.0", - 
"micromark-util-character": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/micromark-extension-math": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/micromark-extension-math/-/micromark-extension-math-3.1.0.tgz", - "integrity": "sha512-lvEqd+fHjATVs+2v/8kg9i5Q0AP2k85H0WUOwpIVvUML8BapsMvh1XAogmQjOCsLpoKRCVQqEkQBB3NhVBcsOg==", - "license": "MIT", - "dependencies": { - "@types/katex": "^0.16.0", - "devlop": "^1.0.0", - "katex": "^0.16.0", - "micromark-factory-space": "^2.0.0", - "micromark-util-character": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/micromark-factory-destination": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-2.0.1.tgz", - "integrity": "sha512-Xe6rDdJlkmbFRExpTOmRj9N3MaWmbAgdpSrBQvCFqhezUn4AHqJHbaEnfbVYYiexVSs//tqOdY/DxhjdCiJnIA==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-util-character": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-factory-label": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-factory-label/-/micromark-factory-label-2.0.1.tgz", - "integrity": "sha512-VFMekyQExqIW7xIChcXn4ok29YE3rnuyveW3wZQWWqF4Nv9Wk5rgJ99KzPvHjkmPXF93FXIbBp6YdW3t71/7Vg==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "devlop": "^1.0.0", - "micromark-util-character": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-factory-space": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-factory-space/-/micromark-factory-space-2.0.1.tgz", - "integrity": "sha512-zRkxjtBxxLd2Sc0d+fbnEunsTj46SWXgXciZmHq0kDYGnck/ZSGj9/wULTV95uoeYiK5hRXP2mJ98Uo4cq/LQg==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-util-character": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-factory-title": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-factory-title/-/micromark-factory-title-2.0.1.tgz", - "integrity": "sha512-5bZ+3CjhAd9eChYTHsjy6TGxpOFSKgKKJPJxr293jTbfry2KDoWkhBb6TcPVB4NmzaPhMs1Frm9AZH7OD4Cjzw==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-factory-space": "^2.0.0", - "micromark-util-character": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-factory-whitespace": { - "version": "2.0.1", - "resolved": 
"https://registry.npmjs.org/micromark-factory-whitespace/-/micromark-factory-whitespace-2.0.1.tgz", - "integrity": "sha512-Ob0nuZ3PKt/n0hORHyvoD9uZhr+Za8sFoP+OnMcnWK5lngSzALgQYKMr9RJVOWLqQYuyn6ulqGWSXdwf6F80lQ==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-factory-space": "^2.0.0", - "micromark-util-character": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-util-character": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/micromark-util-character/-/micromark-util-character-2.1.1.tgz", - "integrity": "sha512-wv8tdUTJ3thSFFFJKtpYKOYiGP2+v96Hvk4Tu8KpCAsTMs6yi+nVmGh1syvSCsaxz45J6Jbw+9DD6g97+NV67Q==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-util-chunked": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-chunked/-/micromark-util-chunked-2.0.1.tgz", - "integrity": "sha512-QUNFEOPELfmvv+4xiNg2sRYeS/P84pTW0TCgP5zc9FpXetHY0ab7SxKyAQCNCc1eK0459uoLI1y5oO5Vc1dbhA==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-util-symbol": "^2.0.0" - } - }, - "node_modules/micromark-util-classify-character": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-classify-character/-/micromark-util-classify-character-2.0.1.tgz", - "integrity": "sha512-K0kHzM6afW/MbeWYWLjoHQv1sgg2Q9EccHEDzSkxiP/EaagNzCm7T/WMKZ3rjMbvIpvBiZgwR3dKMygtA4mG1Q==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-util-character": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-util-combine-extensions": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-combine-extensions/-/micromark-util-combine-extensions-2.0.1.tgz", - "integrity": "sha512-OnAnH8Ujmy59JcyZw8JSbK9cGpdVY44NKgSM7E9Eh7DiLS2E9RNQf0dONaGDzEG9yjEl5hcqeIsj4hfRkLH/Bg==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-util-chunked": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-util-decode-numeric-character-reference": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/micromark-util-decode-numeric-character-reference/-/micromark-util-decode-numeric-character-reference-2.0.2.tgz", - "integrity": "sha512-ccUbYk6CwVdkmCQMyr64dXz42EfHGkPQlBj5p7YVGzq8I7CtjXZJrubAYezf7Rp+bjPseiROqe7G6foFd+lEuw==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": 
"https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-util-symbol": "^2.0.0" - } - }, - "node_modules/micromark-util-decode-string": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-decode-string/-/micromark-util-decode-string-2.0.1.tgz", - "integrity": "sha512-nDV/77Fj6eH1ynwscYTOsbK7rR//Uj0bZXBwJZRfaLEJ1iGBR6kIfNmlNqaqJf649EP0F3NWNdeJi03elllNUQ==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "decode-named-character-reference": "^1.0.0", - "micromark-util-character": "^2.0.0", - "micromark-util-decode-numeric-character-reference": "^2.0.0", - "micromark-util-symbol": "^2.0.0" - } - }, - "node_modules/micromark-util-encode": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-encode/-/micromark-util-encode-2.0.1.tgz", - "integrity": "sha512-c3cVx2y4KqUnwopcO9b/SCdo2O67LwJJ/UyqGfbigahfegL9myoEFoDYZgkT7f36T0bLrM9hZTAaAyH+PCAXjw==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT" - }, - "node_modules/micromark-util-html-tag-name": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-html-tag-name/-/micromark-util-html-tag-name-2.0.1.tgz", - "integrity": "sha512-2cNEiYDhCWKI+Gs9T0Tiysk136SnR13hhO8yW6BGNyhOC4qYFnwF1nKfD3HFAIXA5c45RrIG1ub11GiXeYd1xA==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT" - }, - "node_modules/micromark-util-normalize-identifier": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-normalize-identifier/-/micromark-util-normalize-identifier-2.0.1.tgz", - "integrity": "sha512-sxPqmo70LyARJs0w2UclACPUUEqltCkJ6PhKdMIDuJ3gSf/Q+/GIe3WKl0Ijb/GyH9lOpUkRAO2wp0GVkLvS9Q==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-util-symbol": "^2.0.0" - } - }, - "node_modules/micromark-util-resolve-all": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-resolve-all/-/micromark-util-resolve-all-2.0.1.tgz", - "integrity": "sha512-VdQyxFWFT2/FGJgwQnJYbe1jjQoNTS4RjglmSjTUlpUMa95Htx9NHeYW4rGDJzbjvCsl9eLjMQwGeElsqmzcHg==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-util-sanitize-uri": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-sanitize-uri/-/micromark-util-sanitize-uri-2.0.1.tgz", - "integrity": "sha512-9N9IomZ/YuGGZZmQec1MbgxtlgougxTodVwDzzEouPKo3qFWvymFHWcnDi2vzV1ff6kas9ucW+o3yzJK9YB1AQ==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - 
"micromark-util-character": "^2.0.0", - "micromark-util-encode": "^2.0.0", - "micromark-util-symbol": "^2.0.0" - } - }, - "node_modules/micromark-util-subtokenize": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/micromark-util-subtokenize/-/micromark-util-subtokenize-2.0.4.tgz", - "integrity": "sha512-N6hXjrin2GTJDe3MVjf5FuXpm12PGm80BrUAeub9XFXca8JZbP+oIwY4LJSVwFUCL1IPm/WwSVUN7goFHmSGGQ==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT", - "dependencies": { - "devlop": "^1.0.0", - "micromark-util-chunked": "^2.0.0", - "micromark-util-symbol": "^2.0.0", - "micromark-util-types": "^2.0.0" - } - }, - "node_modules/micromark-util-symbol": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-symbol/-/micromark-util-symbol-2.0.1.tgz", - "integrity": "sha512-vs5t8Apaud9N28kgCrRUdEed4UJ+wWNvicHLPxCa9ENlYuAY31M0ETy5y1vA33YoNPDFTghEbnh6efaE8h4x0Q==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT" - }, - "node_modules/micromark-util-types": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/micromark-util-types/-/micromark-util-types-2.0.1.tgz", - "integrity": "sha512-534m2WhVTddrcKVepwmVEVnUAmtrx9bfIjNoQHRqfnvdaHQiFytEhJoTgpWJvDEXCO5gLTQh3wYC1PgOJA4NSQ==", - "funding": [ - { - "type": "GitHub Sponsors", - "url": "https://github.com/sponsors/unifiedjs" - }, - { - "type": "OpenCollective", - "url": "https://opencollective.com/unified" - } - ], - "license": "MIT" - }, - "node_modules/micromatch": { - "version": "4.0.8", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", - "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", - "license": "MIT", - "dependencies": { - "braces": "^3.0.3", - "picomatch": "^2.3.1" - }, - "engines": { - "node": ">=8.6" - } - }, - "node_modules/minimatch": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", - "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, - "license": "ISC", - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" - } - }, - "node_modules/minipass": { - "version": "7.1.2", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", - "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", - "license": "ISC", - "engines": { - "node": ">=16 || 14 >=14.17" - } - }, - "node_modules/ms": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", - "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", - "license": "MIT" - }, - "node_modules/mz": { - "version": "2.7.0", - "resolved": "https://registry.npmjs.org/mz/-/mz-2.7.0.tgz", - "integrity": "sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==", - "license": "MIT", - "dependencies": { - "any-promise": "^1.0.0", - "object-assign": "^4.0.1", - "thenify-all": "^1.0.0" - } - }, - "node_modules/nanoid": { - "version": "3.3.8", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz", - "integrity": 
"sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "bin": { - "nanoid": "bin/nanoid.cjs" - }, - "engines": { - "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" - } - }, - "node_modules/natural-compare": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", - "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==", - "dev": true, - "license": "MIT" - }, - "node_modules/node-releases": { - "version": "2.0.19", - "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.19.tgz", - "integrity": "sha512-xxOWJsBKtzAq7DY0J+DTzuz58K8e7sJbdgwkbMWQe8UYB6ekmsQ45q0M/tJDsGaZmbC+l7n57UV8Hl5tHxO9uw==", - "license": "MIT" - }, - "node_modules/normalize-path": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", - "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/normalize-range": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/normalize-range/-/normalize-range-0.1.2.tgz", - "integrity": "sha512-bdok/XvKII3nUpklnV6P2hxtMNrCboOjAcyBuQnWEhO665FwrSNRxU+AqpsyvO6LgGYPspN+lu5CLtw4jPRKNA==", - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/object-assign": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", - "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/object-hash": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/object-hash/-/object-hash-3.0.0.tgz", - "integrity": "sha512-RSn9F68PjH9HqtltsSnqYC1XXoWe9Bju5+213R98cNGttag9q9yAOTzdbsqvIa7aNm5WffBZFpWYr2aWrklWAw==", - "license": "MIT", - "engines": { - "node": ">= 6" - } - }, - "node_modules/optionator": { - "version": "0.9.4", - "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", - "integrity": "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==", - "dev": true, - "license": "MIT", - "dependencies": { - "deep-is": "^0.1.3", - "fast-levenshtein": "^2.0.6", - "levn": "^0.4.1", - "prelude-ls": "^1.2.1", - "type-check": "^0.4.0", - "word-wrap": "^1.2.5" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/p-limit": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", - "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "yocto-queue": "^0.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/p-locate": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz", - "integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==", - "dev": true, - "license": "MIT", - "dependencies": { - "p-limit": "^3.0.2" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - 
"node_modules/package-json-from-dist": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.1.tgz", - "integrity": "sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==", - "license": "BlueOak-1.0.0" - }, - "node_modules/parent-module": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", - "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==", - "dev": true, - "license": "MIT", - "dependencies": { - "callsites": "^3.0.0" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/parse-entities": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/parse-entities/-/parse-entities-4.0.2.tgz", - "integrity": "sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw==", - "license": "MIT", - "dependencies": { - "@types/unist": "^2.0.0", - "character-entities-legacy": "^3.0.0", - "character-reference-invalid": "^2.0.0", - "decode-named-character-reference": "^1.0.0", - "is-alphanumerical": "^2.0.0", - "is-decimal": "^2.0.0", - "is-hexadecimal": "^2.0.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/parse-entities/node_modules/@types/unist": { - "version": "2.0.11", - "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", - "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", - "license": "MIT" - }, - "node_modules/parse5": { - "version": "7.2.1", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.2.1.tgz", - "integrity": "sha512-BuBYQYlv1ckiPdQi/ohiivi9Sagc9JG+Ozs0r7b/0iK3sKmrb0b9FdWdBbOdx6hBCM/F9Ir82ofnBhtZOjCRPQ==", - "license": "MIT", - "dependencies": { - "entities": "^4.5.0" - }, - "funding": { - "url": "https://github.com/inikulin/parse5?sponsor=1" - } - }, - "node_modules/path-exists": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", - "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/path-key": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", - "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/path-parse": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", - "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", - "license": "MIT" - }, - "node_modules/path-scurry": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz", - "integrity": "sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==", - "license": "BlueOak-1.0.0", - "dependencies": { - "lru-cache": "^10.2.0", - "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0" - }, - "engines": { - "node": ">=16 || 14 >=14.18" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/path-scurry/node_modules/lru-cache": { - "version": "10.4.3", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", - 
"integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==", - "license": "ISC" - }, - "node_modules/picocolors": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", - "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", - "license": "ISC" - }, - "node_modules/picomatch": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", - "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", - "license": "MIT", - "engines": { - "node": ">=8.6" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, - "node_modules/pify": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", - "integrity": "sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==", - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/pirates": { - "version": "4.0.6", - "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.6.tgz", - "integrity": "sha512-saLsH7WeYYPiD25LDuLRRY/i+6HaPYr6G1OUlN39otzkSTxKnubR9RTxS3/Kk50s1g2JTgFwWQDQyplC5/SHZg==", - "license": "MIT", - "engines": { - "node": ">= 6" - } - }, - "node_modules/postcss": { - "version": "8.5.1", - "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.1.tgz", - "integrity": "sha512-6oz2beyjc5VMn/KV1pPw8fliQkhBXrVn1Z3TVyqZxU8kZpzEKhBdmCFqI6ZbmGtamQvQGuU1sgPTk8ZrXDD7jQ==", - "funding": [ - { - "type": "opencollective", - "url": "https://opencollective.com/postcss/" - }, - { - "type": "tidelift", - "url": "https://tidelift.com/funding/github/npm/postcss" - }, - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "dependencies": { - "nanoid": "^3.3.8", - "picocolors": "^1.1.1", - "source-map-js": "^1.2.1" - }, - "engines": { - "node": "^10 || ^12 || >=14" - } - }, - "node_modules/postcss-import": { - "version": "15.1.0", - "resolved": "https://registry.npmjs.org/postcss-import/-/postcss-import-15.1.0.tgz", - "integrity": "sha512-hpr+J05B2FVYUAXHeK1YyI267J/dDDhMU6B6civm8hSY1jYJnBXxzKDKDswzJmtLHryrjhnDjqqp/49t8FALew==", - "license": "MIT", - "dependencies": { - "postcss-value-parser": "^4.0.0", - "read-cache": "^1.0.0", - "resolve": "^1.1.7" - }, - "engines": { - "node": ">=14.0.0" - }, - "peerDependencies": { - "postcss": "^8.0.0" - } - }, - "node_modules/postcss-js": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/postcss-js/-/postcss-js-4.0.1.tgz", - "integrity": "sha512-dDLF8pEO191hJMtlHFPRa8xsizHaM82MLfNkUHdUtVEV3tgTp5oj+8qbEqYM57SLfc74KSbw//4SeJma2LRVIw==", - "license": "MIT", - "dependencies": { - "camelcase-css": "^2.0.1" - }, - "engines": { - "node": "^12 || ^14 || >= 16" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/postcss/" - }, - "peerDependencies": { - "postcss": "^8.4.21" - } - }, - "node_modules/postcss-load-config": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/postcss-load-config/-/postcss-load-config-4.0.2.tgz", - "integrity": "sha512-bSVhyJGL00wMVoPUzAVAnbEoWyqRxkjv64tUl427SKnPrENtq6hJwUojroMz2VB+Q1edmi4IfrAPpami5VVgMQ==", - "funding": [ - { - "type": "opencollective", - "url": "https://opencollective.com/postcss/" - }, - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "dependencies": { - "lilconfig": 
"^3.0.0", - "yaml": "^2.3.4" - }, - "engines": { - "node": ">= 14" - }, - "peerDependencies": { - "postcss": ">=8.0.9", - "ts-node": ">=9.0.0" - }, - "peerDependenciesMeta": { - "postcss": { - "optional": true - }, - "ts-node": { - "optional": true - } - } - }, - "node_modules/postcss-nested": { - "version": "6.2.0", - "resolved": "https://registry.npmjs.org/postcss-nested/-/postcss-nested-6.2.0.tgz", - "integrity": "sha512-HQbt28KulC5AJzG+cZtj9kvKB93CFCdLvog1WFLf1D+xmMvPGlBstkpTEZfK5+AN9hfJocyBFCNiqyS48bpgzQ==", - "funding": [ - { - "type": "opencollective", - "url": "https://opencollective.com/postcss/" - }, - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "dependencies": { - "postcss-selector-parser": "^6.1.1" - }, - "engines": { - "node": ">=12.0" - }, - "peerDependencies": { - "postcss": "^8.2.14" - } - }, - "node_modules/postcss-selector-parser": { - "version": "6.1.2", - "resolved": "https://registry.npmjs.org/postcss-selector-parser/-/postcss-selector-parser-6.1.2.tgz", - "integrity": "sha512-Q8qQfPiZ+THO/3ZrOrO0cJJKfpYCagtMUkXbnEfmgUjwXg6z/WBeOyS9APBBPCTSiDV+s4SwQGu8yFsiMRIudg==", - "license": "MIT", - "dependencies": { - "cssesc": "^3.0.0", - "util-deprecate": "^1.0.2" - }, - "engines": { - "node": ">=4" - } - }, - "node_modules/postcss-value-parser": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz", - "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==", - "license": "MIT" - }, - "node_modules/prelude-ls": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", - "integrity": "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/prettier": { - "version": "3.4.2", - "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.4.2.tgz", - "integrity": "sha512-e9MewbtFo+Fevyuxn/4rrcDAaq0IYxPGLvObpQjiZBMAzB9IGmzlnG9RZy3FFas+eBMu2vA0CszMeduow5dIuQ==", - "dev": true, - "license": "MIT", - "bin": { - "prettier": "bin/prettier.cjs" - }, - "engines": { - "node": ">=14" - }, - "funding": { - "url": "https://github.com/prettier/prettier?sponsor=1" - } - }, - "node_modules/property-information": { - "version": "6.5.0", - "resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz", - "integrity": "sha512-PgTgs/BlvHxOu8QuEN7wi5A0OmXaBcHpmCSTehcs6Uuu9IkDIEo13Hy7n898RHfrQ49vKCoGeWZSaAK01nwVig==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/punycode": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", - "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6" - } - }, - "node_modules/queue-microtask": { - "version": "1.2.3", - "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", - "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - 
"license": "MIT" - }, - "node_modules/react": { - "version": "18.3.1", - "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", - "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", - "license": "MIT", - "dependencies": { - "loose-envify": "^1.1.0" - }, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/react-dom": { - "version": "18.3.1", - "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", - "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", - "license": "MIT", - "dependencies": { - "loose-envify": "^1.1.0", - "scheduler": "^0.23.2" - }, - "peerDependencies": { - "react": "^18.3.1" - } - }, - "node_modules/react-markdown": { - "version": "9.0.3", - "resolved": "https://registry.npmjs.org/react-markdown/-/react-markdown-9.0.3.tgz", - "integrity": "sha512-Yk7Z94dbgYTOrdk41Z74GoKA7rThnsbbqBTRYuxoe08qvfQ9tJVhmAKw6BJS/ZORG7kTy/s1QvYzSuaoBA1qfw==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "devlop": "^1.0.0", - "hast-util-to-jsx-runtime": "^2.0.0", - "html-url-attributes": "^3.0.0", - "mdast-util-to-hast": "^13.0.0", - "remark-parse": "^11.0.0", - "remark-rehype": "^11.0.0", - "unified": "^11.0.0", - "unist-util-visit": "^5.0.0", - "vfile": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - }, - "peerDependencies": { - "@types/react": ">=18", - "react": ">=18" - } - }, - "node_modules/react-refresh": { - "version": "0.14.2", - "resolved": "https://registry.npmjs.org/react-refresh/-/react-refresh-0.14.2.tgz", - "integrity": "sha512-jCvmsr+1IUSMUyzOkRcvnVbX3ZYC6g9TDrDbFuFmRDq7PD4yaGbLKNQL6k2jnArV8hjYxh7hVhAZB6s9HDGpZA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/react-router": { - "version": "7.1.5", - "resolved": "https://registry.npmjs.org/react-router/-/react-router-7.1.5.tgz", - "integrity": "sha512-8BUF+hZEU4/z/JD201yK6S+UYhsf58bzYIDq2NS1iGpwxSXDu7F+DeGSkIXMFBuHZB21FSiCzEcUb18cQNdRkA==", - "license": "MIT", - "dependencies": { - "@types/cookie": "^0.6.0", - "cookie": "^1.0.1", - "set-cookie-parser": "^2.6.0", - "turbo-stream": "2.4.0" - }, - "engines": { - "node": ">=20.0.0" - }, - "peerDependencies": { - "react": ">=18", - "react-dom": ">=18" - }, - "peerDependenciesMeta": { - "react-dom": { - "optional": true - } - } - }, - "node_modules/read-cache": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/read-cache/-/read-cache-1.0.0.tgz", - "integrity": "sha512-Owdv/Ft7IjOgm/i0xvNDZ1LrRANRfew4b2prF3OWMQLxLfu3bS8FVhCsrSCMK4lR56Y9ya+AThoTpDCTxCmpRA==", - "license": "MIT", - "dependencies": { - "pify": "^2.3.0" - } - }, - "node_modules/readdirp": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", - "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", - "license": "MIT", - "dependencies": { - "picomatch": "^2.2.1" - }, - "engines": { - "node": ">=8.10.0" - } - }, - "node_modules/rehype-highlight": { - "version": "7.0.2", - "resolved": "https://registry.npmjs.org/rehype-highlight/-/rehype-highlight-7.0.2.tgz", - "integrity": "sha512-k158pK7wdC2qL3M5NcZROZ2tR/l7zOzjxXd5VGdcfIyoijjQqpHd3JKtYSBDpDZ38UI2WJWuFAtkMDxmx5kstA==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "hast-util-to-text": "^4.0.0", - "lowlight": "^3.0.0", - "unist-util-visit": "^5.0.0", - 
"vfile": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/rehype-katex": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/rehype-katex/-/rehype-katex-7.0.1.tgz", - "integrity": "sha512-OiM2wrZ/wuhKkigASodFoo8wimG3H12LWQaH8qSPVJn9apWKFSH3YOCtbKpBorTVw/eI7cuT21XBbvwEswbIOA==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "@types/katex": "^0.16.0", - "hast-util-from-html-isomorphic": "^2.0.0", - "hast-util-to-text": "^4.0.0", - "katex": "^0.16.0", - "unist-util-visit-parents": "^6.0.0", - "vfile": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/remark-breaks": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/remark-breaks/-/remark-breaks-4.0.0.tgz", - "integrity": "sha512-IjEjJOkH4FuJvHZVIW0QCDWxcG96kCq7An/KVH2NfJe6rKZU2AsHeB3OEjPNRxi4QC34Xdx7I2KGYn6IpT7gxQ==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "mdast-util-newline-to-break": "^2.0.0", - "unified": "^11.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/remark-gfm": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/remark-gfm/-/remark-gfm-4.0.0.tgz", - "integrity": "sha512-U92vJgBPkbw4Zfu/IiW2oTZLSL3Zpv+uI7My2eq8JxKgqraFdU8YUGicEJCEgSbeaG+QDFqIcwwfMTOEelPxuA==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "mdast-util-gfm": "^3.0.0", - "micromark-extension-gfm": "^3.0.0", - "remark-parse": "^11.0.0", - "remark-stringify": "^11.0.0", - "unified": "^11.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/remark-math": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/remark-math/-/remark-math-6.0.0.tgz", - "integrity": "sha512-MMqgnP74Igy+S3WwnhQ7kqGlEerTETXMvJhrUzDikVZ2/uogJCb+WHUg97hK9/jcfc0dkD73s3LN8zU49cTEtA==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "mdast-util-math": "^3.0.0", - "micromark-extension-math": "^3.0.0", - "unified": "^11.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/remark-parse": { - "version": "11.0.0", - "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz", - "integrity": "sha512-FCxlKLNGknS5ba/1lmpYijMUzX2esxW5xQqjWxw2eHFfS2MSdaHVINFmhjo+qN1WhZhNimq0dZATN9pH0IDrpA==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "mdast-util-from-markdown": "^2.0.0", - "micromark-util-types": "^2.0.0", - "unified": "^11.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/remark-rehype": { - "version": "11.1.1", - "resolved": "https://registry.npmjs.org/remark-rehype/-/remark-rehype-11.1.1.tgz", - "integrity": "sha512-g/osARvjkBXb6Wo0XvAeXQohVta8i84ACbenPpoSsxTOQH/Ae0/RGP4WZgnMH5pMLpsj4FG7OHmcIcXxpza8eQ==", - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "@types/mdast": "^4.0.0", - "mdast-util-to-hast": "^13.0.0", - "unified": "^11.0.0", - "vfile": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/remark-stringify": { - "version": "11.0.0", - "resolved": "https://registry.npmjs.org/remark-stringify/-/remark-stringify-11.0.0.tgz", - "integrity": 
"sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw==", - "license": "MIT", - "dependencies": { - "@types/mdast": "^4.0.0", - "mdast-util-to-markdown": "^2.0.0", - "unified": "^11.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/resolve": { - "version": "1.22.10", - "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.10.tgz", - "integrity": "sha512-NPRy+/ncIMeDlTAsuqwKIiferiawhefFJtkNSW0qZJEqMEb+qBt/77B/jGeeek+F0uOeN05CDa6HXbbIgtVX4w==", - "license": "MIT", - "dependencies": { - "is-core-module": "^2.16.0", - "path-parse": "^1.0.7", - "supports-preserve-symlinks-flag": "^1.0.0" - }, - "bin": { - "resolve": "bin/resolve" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/resolve-from": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", - "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=4" - } - }, - "node_modules/reusify": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", - "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==", - "license": "MIT", - "engines": { - "iojs": ">=1.0.0", - "node": ">=0.10.0" - } - }, - "node_modules/rollup": { - "version": "4.34.2", - "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.34.2.tgz", - "integrity": "sha512-sBDUoxZEaqLu9QeNalL8v3jw6WjPku4wfZGyTU7l7m1oC+rpRihXc/n/H+4148ZkGz5Xli8CHMns//fFGKvpIQ==", - "license": "MIT", - "dependencies": { - "@types/estree": "1.0.6" - }, - "bin": { - "rollup": "dist/bin/rollup" - }, - "engines": { - "node": ">=18.0.0", - "npm": ">=8.0.0" - }, - "optionalDependencies": { - "@rollup/rollup-android-arm-eabi": "4.34.2", - "@rollup/rollup-android-arm64": "4.34.2", - "@rollup/rollup-darwin-arm64": "4.34.2", - "@rollup/rollup-darwin-x64": "4.34.2", - "@rollup/rollup-freebsd-arm64": "4.34.2", - "@rollup/rollup-freebsd-x64": "4.34.2", - "@rollup/rollup-linux-arm-gnueabihf": "4.34.2", - "@rollup/rollup-linux-arm-musleabihf": "4.34.2", - "@rollup/rollup-linux-arm64-gnu": "4.34.2", - "@rollup/rollup-linux-arm64-musl": "4.34.2", - "@rollup/rollup-linux-loongarch64-gnu": "4.34.2", - "@rollup/rollup-linux-powerpc64le-gnu": "4.34.2", - "@rollup/rollup-linux-riscv64-gnu": "4.34.2", - "@rollup/rollup-linux-s390x-gnu": "4.34.2", - "@rollup/rollup-linux-x64-gnu": "4.34.2", - "@rollup/rollup-linux-x64-musl": "4.34.2", - "@rollup/rollup-win32-arm64-msvc": "4.34.2", - "@rollup/rollup-win32-ia32-msvc": "4.34.2", - "@rollup/rollup-win32-x64-msvc": "4.34.2", - "fsevents": "~2.3.2" - } - }, - "node_modules/run-parallel": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", - "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT", - "dependencies": { - "queue-microtask": "^1.2.2" - } - }, - "node_modules/rxjs": { - "version": "7.8.1", - "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz", - 
"integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==", - "devOptional": true, - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.1.0" - } - }, - "node_modules/sass-embedded": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded/-/sass-embedded-1.83.4.tgz", - "integrity": "sha512-Hf2burRA/y5PGxsg6jB9UpoK/xZ6g/pgrkOcdl6j+rRg1Zj8XhGKZ1MTysZGtTPUUmiiErqzkP5+Kzp95yv9GQ==", - "devOptional": true, - "license": "MIT", - "dependencies": { - "@bufbuild/protobuf": "^2.0.0", - "buffer-builder": "^0.2.0", - "colorjs.io": "^0.5.0", - "immutable": "^5.0.2", - "rxjs": "^7.4.0", - "supports-color": "^8.1.1", - "sync-child-process": "^1.0.2", - "varint": "^6.0.0" - }, - "bin": { - "sass": "dist/bin/sass.js" - }, - "engines": { - "node": ">=16.0.0" - }, - "optionalDependencies": { - "sass-embedded-android-arm": "1.83.4", - "sass-embedded-android-arm64": "1.83.4", - "sass-embedded-android-ia32": "1.83.4", - "sass-embedded-android-riscv64": "1.83.4", - "sass-embedded-android-x64": "1.83.4", - "sass-embedded-darwin-arm64": "1.83.4", - "sass-embedded-darwin-x64": "1.83.4", - "sass-embedded-linux-arm": "1.83.4", - "sass-embedded-linux-arm64": "1.83.4", - "sass-embedded-linux-ia32": "1.83.4", - "sass-embedded-linux-musl-arm": "1.83.4", - "sass-embedded-linux-musl-arm64": "1.83.4", - "sass-embedded-linux-musl-ia32": "1.83.4", - "sass-embedded-linux-musl-riscv64": "1.83.4", - "sass-embedded-linux-musl-x64": "1.83.4", - "sass-embedded-linux-riscv64": "1.83.4", - "sass-embedded-linux-x64": "1.83.4", - "sass-embedded-win32-arm64": "1.83.4", - "sass-embedded-win32-ia32": "1.83.4", - "sass-embedded-win32-x64": "1.83.4" - } - }, - "node_modules/sass-embedded-android-arm": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-android-arm/-/sass-embedded-android-arm-1.83.4.tgz", - "integrity": "sha512-9Z4pJAOgEkXa3VDY/o+U6l5XvV0mZTJcSl0l/mSPHihjAHSpLYnOW6+KOWeM8dxqrsqTYcd6COzhanI/a++5Gw==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-android-arm64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-android-arm64/-/sass-embedded-android-arm64-1.83.4.tgz", - "integrity": "sha512-tgX4FzmbVqnQmD67ZxQDvI+qFNABrboOQgwsG05E5bA/US42zGajW9AxpECJYiMXVOHmg+d81ICbjb0fsVHskw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-android-ia32": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-android-ia32/-/sass-embedded-android-ia32-1.83.4.tgz", - "integrity": "sha512-RsFOziFqPcfZXdFRULC4Ayzy9aK6R6FwQ411broCjlOBX+b0gurjRadkue3cfUEUR5mmy0KeCbp7zVKPLTK+5Q==", - "cpu": [ - "ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-android-riscv64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-android-riscv64/-/sass-embedded-android-riscv64-1.83.4.tgz", - "integrity": "sha512-EHwh0nmQarBBrMRU928eTZkFGx19k/XW2YwbPR4gBVdWLkbTgCA5aGe8hTE6/1zStyx++3nDGvTZ78+b/VvvLg==", - "cpu": [ - "riscv64" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-android-x64": { - "version": "1.83.4", - "resolved": 
"https://registry.npmjs.org/sass-embedded-android-x64/-/sass-embedded-android-x64-1.83.4.tgz", - "integrity": "sha512-0PgQNuPWYy1jEOEPDVsV89KfqOsMLIp9CSbjBY7jRcwRhyVAcigqrUG6bDeNtojHUYKA1kU+Eh/85WxOHUOgBw==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-darwin-arm64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-darwin-arm64/-/sass-embedded-darwin-arm64-1.83.4.tgz", - "integrity": "sha512-rp2ywymWc3nymnSnAFG5R/8hvxWCsuhK3wOnD10IDlmNB7o4rzKby1c+2ZfpQGowlYGWsWWTgz8FW2qzmZsQRw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-darwin-x64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-darwin-x64/-/sass-embedded-darwin-x64-1.83.4.tgz", - "integrity": "sha512-kLkN2lXz9PCgGfDS8Ev5YVcl/V2173L6379en/CaFuJJi7WiyPgBymW7hOmfCt4uO4R1y7CP2Uc08DRtZsBlAA==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-linux-arm": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm/-/sass-embedded-linux-arm-1.83.4.tgz", - "integrity": "sha512-nL90ryxX2lNmFucr9jYUyHHx21AoAgdCL1O5Ltx2rKg2xTdytAGHYo2MT5S0LIeKLa/yKP/hjuSvrbICYNDvtA==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-linux-arm64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm64/-/sass-embedded-linux-arm64-1.83.4.tgz", - "integrity": "sha512-E0zjsZX2HgESwyqw31EHtI39DKa7RgK7nvIhIRco1d0QEw227WnoR9pjH3M/ZQy4gQj3GKilOFHM5Krs/omeIA==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-linux-ia32": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-linux-ia32/-/sass-embedded-linux-ia32-1.83.4.tgz", - "integrity": "sha512-ew5HpchSzgAYbQoriRh8QhlWn5Kw2nQ2jHoV9YLwGKe3fwwOWA0KDedssvDv7FWnY/FCqXyymhLd6Bxae4Xquw==", - "cpu": [ - "ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-linux-musl-arm": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm/-/sass-embedded-linux-musl-arm-1.83.4.tgz", - "integrity": "sha512-0RrJRwMrmm+gG0VOB5b5Cjs7Sd+lhqpQJa6EJNEaZHljJokEfpE5GejZsGMRMIQLxEvVphZnnxl6sonCGFE/QQ==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-linux-musl-arm64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm64/-/sass-embedded-linux-musl-arm64-1.83.4.tgz", - "integrity": "sha512-IzMgalf6MZOxgp4AVCgsaWAFDP/IVWOrgVXxkyhw29fyAEoSWBJH4k87wyPhEtxSuzVHLxKNbc8k3UzdWmlBFg==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-linux-musl-ia32": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-ia32/-/sass-embedded-linux-musl-ia32-1.83.4.tgz", - 
"integrity": "sha512-LLb4lYbcxPzX4UaJymYXC+WwokxUlfTJEFUv5VF0OTuSsHAGNRs/rslPtzVBTvMeG9TtlOQDhku1F7G6iaDotA==", - "cpu": [ - "ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-linux-musl-riscv64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-riscv64/-/sass-embedded-linux-musl-riscv64-1.83.4.tgz", - "integrity": "sha512-zoKlPzD5Z13HKin1UGR74QkEy+kZEk2AkGX5RelRG494mi+IWwRuWCppXIovor9+BQb9eDWPYPoMVahwN5F7VA==", - "cpu": [ - "riscv64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-linux-musl-x64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-x64/-/sass-embedded-linux-musl-x64-1.83.4.tgz", - "integrity": "sha512-hB8+/PYhfEf2zTIcidO5Bpof9trK6WJjZ4T8g2MrxQh8REVtdPcgIkoxczRynqybf9+fbqbUwzXtiUao2GV+vQ==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-linux-riscv64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-linux-riscv64/-/sass-embedded-linux-riscv64-1.83.4.tgz", - "integrity": "sha512-83fL4n+oeDJ0Y4KjASmZ9jHS1Vl9ESVQYHMhJE0i4xDi/P3BNarm2rsKljq/QtrwGpbqwn8ujzOu7DsNCMDSHA==", - "cpu": [ - "riscv64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-linux-x64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-linux-x64/-/sass-embedded-linux-x64-1.83.4.tgz", - "integrity": "sha512-NlnGdvCmTD5PK+LKXlK3sAuxOgbRIEoZfnHvxd157imCm/s2SYF/R28D0DAAjEViyI8DovIWghgbcqwuertXsA==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-win32-arm64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-win32-arm64/-/sass-embedded-win32-arm64-1.83.4.tgz", - "integrity": "sha512-J2BFKrEaeSrVazU2qTjyQdAk+MvbzJeTuCET0uAJEXSKtvQ3AzxvzndS7LqkDPbF32eXAHLw8GVpwcBwKbB3Uw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-win32-ia32": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-win32-ia32/-/sass-embedded-win32-ia32-1.83.4.tgz", - "integrity": "sha512-uPAe9T/5sANFhJS5dcfAOhOJy8/l2TRYG4r+UO3Wp4yhqbN7bggPvY9c7zMYS0OC8tU/bCvfYUDFHYMCl91FgA==", - "cpu": [ - "ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded-win32-x64": { - "version": "1.83.4", - "resolved": "https://registry.npmjs.org/sass-embedded-win32-x64/-/sass-embedded-win32-x64-1.83.4.tgz", - "integrity": "sha512-C9fkDY0jKITdJFij4UbfPFswxoXN9O/Dr79v17fJnstVwtUojzVJWKHUXvF0Zg2LIR7TCc4ju3adejKFxj7ueA==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/sass-embedded/node_modules/supports-color": { - "version": "8.1.1", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz", - "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==", - 
"devOptional": true, - "license": "MIT", - "dependencies": { - "has-flag": "^4.0.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/supports-color?sponsor=1" - } - }, - "node_modules/scheduler": { - "version": "0.23.2", - "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.23.2.tgz", - "integrity": "sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==", - "license": "MIT", - "dependencies": { - "loose-envify": "^1.1.0" - } - }, - "node_modules/semver": { - "version": "6.3.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", - "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", - "dev": true, - "license": "ISC", - "bin": { - "semver": "bin/semver.js" - } - }, - "node_modules/set-cookie-parser": { - "version": "2.7.1", - "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.7.1.tgz", - "integrity": "sha512-IOc8uWeOZgnb3ptbCURJWNjWUPcO3ZnTTdzsurqERrP6nPyv+paC55vJM0LpOlT2ne+Ix+9+CRG1MNLlyZ4GjQ==", - "license": "MIT" - }, - "node_modules/shebang-command": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", - "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==", - "license": "MIT", - "dependencies": { - "shebang-regex": "^3.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/shebang-regex": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz", - "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/signal-exit": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", - "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", - "license": "ISC", - "engines": { - "node": ">=14" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/source-map-js": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", - "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", - "license": "BSD-3-Clause", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/space-separated-tokens": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-2.0.2.tgz", - "integrity": "sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/string-width": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz", - "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==", - "license": "MIT", - "dependencies": { - "eastasianwidth": "^0.2.0", - "emoji-regex": "^9.2.2", - "strip-ansi": "^7.0.1" - }, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/string-width-cjs": { - "name": "string-width", - "version": "4.2.3", - "resolved": 
"https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", - "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", - "license": "MIT", - "dependencies": { - "emoji-regex": "^8.0.0", - "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/string-width-cjs/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/string-width-cjs/node_modules/emoji-regex": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", - "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", - "license": "MIT" - }, - "node_modules/string-width-cjs/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "license": "MIT", - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/stringify-entities": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/stringify-entities/-/stringify-entities-4.0.4.tgz", - "integrity": "sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg==", - "license": "MIT", - "dependencies": { - "character-entities-html4": "^2.0.0", - "character-entities-legacy": "^3.0.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/strip-ansi": { - "version": "7.1.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", - "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==", - "license": "MIT", - "dependencies": { - "ansi-regex": "^6.0.1" - }, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/strip-ansi?sponsor=1" - } - }, - "node_modules/strip-ansi-cjs": { - "name": "strip-ansi", - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "license": "MIT", - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/strip-ansi-cjs/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/strip-json-comments": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", - "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/style-to-object": { - "version": "1.0.8", - "resolved": 
"https://registry.npmjs.org/style-to-object/-/style-to-object-1.0.8.tgz", - "integrity": "sha512-xT47I/Eo0rwJmaXC4oilDGDWLohVhR6o/xAQcPQN8q6QBuZVL8qMYL85kLmST5cPjAorwvqIA4qXTRQoYHaL6g==", - "license": "MIT", - "dependencies": { - "inline-style-parser": "0.2.4" - } - }, - "node_modules/sucrase": { - "version": "3.35.0", - "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz", - "integrity": "sha512-8EbVDiu9iN/nESwxeSxDKe0dunta1GOlHufmSSXxMD2z2/tMZpDMpvXQGsc+ajGo8y2uYUmixaSRUc/QPoQ0GA==", - "license": "MIT", - "dependencies": { - "@jridgewell/gen-mapping": "^0.3.2", - "commander": "^4.0.0", - "glob": "^10.3.10", - "lines-and-columns": "^1.1.6", - "mz": "^2.7.0", - "pirates": "^4.0.1", - "ts-interface-checker": "^0.1.9" - }, - "bin": { - "sucrase": "bin/sucrase", - "sucrase-node": "bin/sucrase-node" - }, - "engines": { - "node": ">=16 || 14 >=14.17" - } - }, - "node_modules/sucrase/node_modules/commander": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz", - "integrity": "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==", - "license": "MIT", - "engines": { - "node": ">= 6" - } - }, - "node_modules/supports-color": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", - "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", - "dev": true, - "license": "MIT", - "dependencies": { - "has-flag": "^4.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/supports-preserve-symlinks-flag": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", - "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/sync-child-process": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/sync-child-process/-/sync-child-process-1.0.2.tgz", - "integrity": "sha512-8lD+t2KrrScJ/7KXCSyfhT3/hRq78rC0wBFqNJXv3mZyn6hW2ypM05JmlSvtqRbeq6jqA94oHbxAr2vYsJ8vDA==", - "devOptional": true, - "license": "MIT", - "dependencies": { - "sync-message-port": "^1.0.0" - }, - "engines": { - "node": ">=16.0.0" - } - }, - "node_modules/sync-message-port": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/sync-message-port/-/sync-message-port-1.1.3.tgz", - "integrity": "sha512-GTt8rSKje5FilG+wEdfCkOcLL7LWqpMlr2c3LRuKt/YXxcJ52aGSbGBAdI4L3aaqfrBt6y711El53ItyH1NWzg==", - "devOptional": true, - "license": "MIT", - "engines": { - "node": ">=16.0.0" - } - }, - "node_modules/tailwindcss": { - "version": "3.4.17", - "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.17.tgz", - "integrity": "sha512-w33E2aCvSDP0tW9RZuNXadXlkHXqFzSkQew/aIa2i/Sj8fThxwovwlXHSPXTbAHwEIhBFXAedUhP2tueAKP8Og==", - "license": "MIT", - "dependencies": { - "@alloc/quick-lru": "^5.2.0", - "arg": "^5.0.2", - "chokidar": "^3.6.0", - "didyoumean": "^1.2.2", - "dlv": "^1.1.3", - "fast-glob": "^3.3.2", - "glob-parent": "^6.0.2", - "is-glob": "^4.0.3", - "jiti": "^1.21.6", - "lilconfig": "^3.1.3", - "micromatch": "^4.0.8", - "normalize-path": "^3.0.0", - "object-hash": "^3.0.0", - "picocolors": "^1.1.1", - "postcss": "^8.4.47", - "postcss-import": "^15.1.0", - "postcss-js": "^4.0.1", - "postcss-load-config": "^4.0.2", - 
"postcss-nested": "^6.2.0", - "postcss-selector-parser": "^6.1.2", - "resolve": "^1.22.8", - "sucrase": "^3.35.0" - }, - "bin": { - "tailwind": "lib/cli.js", - "tailwindcss": "lib/cli.js" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/textlinestream": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/textlinestream/-/textlinestream-1.1.1.tgz", - "integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==", - "license": "MIT" - }, - "node_modules/thenify": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", - "integrity": "sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==", - "license": "MIT", - "dependencies": { - "any-promise": "^1.0.0" - } - }, - "node_modules/thenify-all": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/thenify-all/-/thenify-all-1.6.0.tgz", - "integrity": "sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==", - "license": "MIT", - "dependencies": { - "thenify": ">= 3.1.0 < 4" - }, - "engines": { - "node": ">=0.8" - } - }, - "node_modules/to-regex-range": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", - "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", - "license": "MIT", - "dependencies": { - "is-number": "^7.0.0" - }, - "engines": { - "node": ">=8.0" - } - }, - "node_modules/trim-lines": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/trim-lines/-/trim-lines-3.0.1.tgz", - "integrity": "sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/trough": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/trough/-/trough-2.2.0.tgz", - "integrity": "sha512-tmMpK00BjZiUyVyvrBK7knerNgmgvcV/KLVyuma/SC+TQN167GrMRciANTz09+k3zW8L8t60jWO1GpfkZdjTaw==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/ts-api-utils": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.0.1.tgz", - "integrity": "sha512-dnlgjFSVetynI8nzgJ+qF62efpglpWRk8isUEWZGWlJYySCTD6aKvbUDu+zbPeDakk3bg5H4XpitHukgfL1m9w==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18.12" - }, - "peerDependencies": { - "typescript": ">=4.8.4" - } - }, - "node_modules/ts-interface-checker": { - "version": "0.1.13", - "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz", - "integrity": "sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==", - "license": "Apache-2.0" - }, - "node_modules/tslib": { - "version": "2.8.1", - "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", - "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", - "devOptional": true, - "license": "0BSD" - }, - "node_modules/turbo-stream": { - "version": "2.4.0", - "resolved": "https://registry.npmjs.org/turbo-stream/-/turbo-stream-2.4.0.tgz", - "integrity": "sha512-FHncC10WpBd2eOmGwpmQsWLDoK4cqsA/UT/GqNoaKOQnT8uzhtCbg3EoUDMvqpOSAI0S26mr0rkjzbOO6S3v1g==", - "license": "ISC" - }, - "node_modules/type-check": { - "version": 
"0.4.0", - "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", - "integrity": "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==", - "dev": true, - "license": "MIT", - "dependencies": { - "prelude-ls": "^1.2.1" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/typescript": { - "version": "5.6.3", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.3.tgz", - "integrity": "sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==", - "dev": true, - "license": "Apache-2.0", - "bin": { - "tsc": "bin/tsc", - "tsserver": "bin/tsserver" - }, - "engines": { - "node": ">=14.17" - } - }, - "node_modules/typescript-eslint": { - "version": "8.23.0", - "resolved": "https://registry.npmjs.org/typescript-eslint/-/typescript-eslint-8.23.0.tgz", - "integrity": "sha512-/LBRo3HrXr5LxmrdYSOCvoAMm7p2jNizNfbIpCgvG4HMsnoprRUOce/+8VJ9BDYWW68rqIENE/haVLWPeFZBVQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@typescript-eslint/eslint-plugin": "8.23.0", - "@typescript-eslint/parser": "8.23.0", - "@typescript-eslint/utils": "8.23.0" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - }, - "peerDependencies": { - "eslint": "^8.57.0 || ^9.0.0", - "typescript": ">=4.8.4 <5.8.0" - } - }, - "node_modules/undici-types": { - "version": "6.20.0", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.20.0.tgz", - "integrity": "sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==", - "devOptional": true, - "license": "MIT" - }, - "node_modules/unified": { - "version": "11.0.5", - "resolved": "https://registry.npmjs.org/unified/-/unified-11.0.5.tgz", - "integrity": "sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0", - "bail": "^2.0.0", - "devlop": "^1.0.0", - "extend": "^3.0.0", - "is-plain-obj": "^4.0.0", - "trough": "^2.0.0", - "vfile": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/unist-util-find-after": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/unist-util-find-after/-/unist-util-find-after-5.0.0.tgz", - "integrity": "sha512-amQa0Ep2m6hE2g72AugUItjbuM8X8cGQnFoHk0pGfrFeT9GZhzN5SW8nRsiGKK7Aif4CrACPENkA6P/Lw6fHGQ==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0", - "unist-util-is": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/unist-util-is": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-6.0.0.tgz", - "integrity": "sha512-2qCTHimwdxLfz+YzdGfkqNlH0tLi9xjTnHddPmJwtIG9MGsdbutfTc4P+haPD7l7Cjxf/WZj+we5qfVPvvxfYw==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/unist-util-position": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/unist-util-position/-/unist-util-position-5.0.0.tgz", - "integrity": "sha512-fucsC7HjXvkB5R3kTCO7kUjRdrS0BJt3M/FPxmHMBOm8JQi2BsHAHFsy27E0EolP8rp0NzXsJ+jNPyDWvOJZPA==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0" - }, - "funding": { - 
"type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/unist-util-remove-position": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/unist-util-remove-position/-/unist-util-remove-position-5.0.0.tgz", - "integrity": "sha512-Hp5Kh3wLxv0PHj9m2yZhhLt58KzPtEYKQQ4yxfYFEO7EvHwzyDYnduhHnY1mDxoqr7VUwVuHXk9RXKIiYS1N8Q==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0", - "unist-util-visit": "^5.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/unist-util-stringify-position": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", - "integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/unist-util-visit": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-5.0.0.tgz", - "integrity": "sha512-MR04uvD+07cwl/yhVuVWAtw+3GOR/knlL55Nd/wAdblk27GCVt3lqpTivy/tkJcZoNPzTwS1Y+KMojlLDhoTzg==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0", - "unist-util-is": "^6.0.0", - "unist-util-visit-parents": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/unist-util-visit-parents": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-6.0.1.tgz", - "integrity": "sha512-L/PqWzfTP9lzzEa6CKs0k2nARxTdZduw3zyh8d2NVBnsyvHjSX4TWse388YrrQKbvI8w20fGjGlhgT96WwKykw==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0", - "unist-util-is": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/update-browserslist-db": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.2.tgz", - "integrity": "sha512-PPypAm5qvlD7XMZC3BujecnaOxwhrtoFR+Dqkk5Aa/6DssiH0ibKoketaj9w8LP7Bont1rYeoV5plxD7RTEPRg==", - "funding": [ - { - "type": "opencollective", - "url": "https://opencollective.com/browserslist" - }, - { - "type": "tidelift", - "url": "https://tidelift.com/funding/github/npm/browserslist" - }, - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "dependencies": { - "escalade": "^3.2.0", - "picocolors": "^1.1.1" - }, - "bin": { - "update-browserslist-db": "cli.js" - }, - "peerDependencies": { - "browserslist": ">= 4.21.0" - } - }, - "node_modules/uri-js": { - "version": "4.4.1", - "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", - "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "punycode": "^2.1.0" - } - }, - "node_modules/util-deprecate": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", - "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", - "license": "MIT" - }, - "node_modules/varint": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/varint/-/varint-6.0.0.tgz", - "integrity": 
"sha512-cXEIW6cfr15lFv563k4GuVuW/fiwjknytD37jIOLSdSWuOI6WnO/oKwmP2FQTU2l01LP8/M5TSAJpzUaGe3uWg==", - "devOptional": true, - "license": "MIT" - }, - "node_modules/vfile": { - "version": "6.0.3", - "resolved": "https://registry.npmjs.org/vfile/-/vfile-6.0.3.tgz", - "integrity": "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0", - "vfile-message": "^4.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/vfile-location": { - "version": "5.0.3", - "resolved": "https://registry.npmjs.org/vfile-location/-/vfile-location-5.0.3.tgz", - "integrity": "sha512-5yXvWDEgqeiYiBe1lbxYF7UMAIm/IcopxMHrMQDq3nvKcjPKIhZklUKL+AE7J7uApI4kwe2snsK+eI6UTj9EHg==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0", - "vfile": "^6.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/vfile-message": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.2.tgz", - "integrity": "sha512-jRDZ1IMLttGj41KcZvlrYAaI3CfqpLpfpf+Mfig13viT6NKvRzWZ+lXz0Y5D60w6uJIBAOGq9mSHf0gktF0duw==", - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0", - "unist-util-stringify-position": "^4.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/vite": { - "version": "6.0.11", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.0.11.tgz", - "integrity": "sha512-4VL9mQPKoHy4+FE0NnRE/kbY51TOfaknxAjt3fJbGJxhIpBZiqVzlZDEesWWsuREXHwNdAoOFZ9MkPEVXczHwg==", - "license": "MIT", - "dependencies": { - "esbuild": "^0.24.2", - "postcss": "^8.4.49", - "rollup": "^4.23.0" - }, - "bin": { - "vite": "bin/vite.js" - }, - "engines": { - "node": "^18.0.0 || ^20.0.0 || >=22.0.0" - }, - "funding": { - "url": "https://github.com/vitejs/vite?sponsor=1" - }, - "optionalDependencies": { - "fsevents": "~2.3.3" - }, - "peerDependencies": { - "@types/node": "^18.0.0 || ^20.0.0 || >=22.0.0", - "jiti": ">=1.21.0", - "less": "*", - "lightningcss": "^1.21.0", - "sass": "*", - "sass-embedded": "*", - "stylus": "*", - "sugarss": "*", - "terser": "^5.16.0", - "tsx": "^4.8.1", - "yaml": "^2.4.2" - }, - "peerDependenciesMeta": { - "@types/node": { - "optional": true - }, - "jiti": { - "optional": true - }, - "less": { - "optional": true - }, - "lightningcss": { - "optional": true - }, - "sass": { - "optional": true - }, - "sass-embedded": { - "optional": true - }, - "stylus": { - "optional": true - }, - "sugarss": { - "optional": true - }, - "terser": { - "optional": true - }, - "tsx": { - "optional": true - }, - "yaml": { - "optional": true - } - } - }, - "node_modules/vite-plugin-singlefile": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/vite-plugin-singlefile/-/vite-plugin-singlefile-2.1.0.tgz", - "integrity": "sha512-7tJo+UgZABlKpY/nubth/wxJ4+pUGREPnEwNOknxwl2MM0zTvF14KTU4Ln1lc140gjLLV5mjDrvuoquU7OZqCg==", - "license": "MIT", - "dependencies": { - "micromatch": "^4.0.8" - }, - "engines": { - "node": ">18.0.0" - }, - "peerDependencies": { - "rollup": "^4.28.1", - "vite": "^5.4.11 || ^6.0.0" - } - }, - "node_modules/web-namespaces": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/web-namespaces/-/web-namespaces-2.0.1.tgz", - "integrity": "sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ==", - 
"license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - }, - "node_modules/which": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", - "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", - "license": "ISC", - "dependencies": { - "isexe": "^2.0.0" - }, - "bin": { - "node-which": "bin/node-which" - }, - "engines": { - "node": ">= 8" - } - }, - "node_modules/word-wrap": { - "version": "1.2.5", - "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.5.tgz", - "integrity": "sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/wrap-ansi": { - "version": "8.1.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz", - "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==", - "license": "MIT", - "dependencies": { - "ansi-styles": "^6.1.0", - "string-width": "^5.0.1", - "strip-ansi": "^7.0.1" - }, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/wrap-ansi?sponsor=1" - } - }, - "node_modules/wrap-ansi-cjs": { - "name": "wrap-ansi", - "version": "7.0.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", - "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", - "license": "MIT", - "dependencies": { - "ansi-styles": "^4.0.0", - "string-width": "^4.1.0", - "strip-ansi": "^6.0.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/wrap-ansi?sponsor=1" - } - }, - "node_modules/wrap-ansi-cjs/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/wrap-ansi-cjs/node_modules/emoji-regex": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", - "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", - "license": "MIT" - }, - "node_modules/wrap-ansi-cjs/node_modules/string-width": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", - "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", - "license": "MIT", - "dependencies": { - "emoji-regex": "^8.0.0", - "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/wrap-ansi-cjs/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "license": "MIT", - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/wrap-ansi/node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", - 
"license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/yallist": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", - "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==", - "dev": true, - "license": "ISC" - }, - "node_modules/yaml": { - "version": "2.7.0", - "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.7.0.tgz", - "integrity": "sha512-+hSoy/QHluxmC9kCIJyL/uyFmLmc+e5CFR5Wa+bpIhIj85LVb9ZH2nVnqrHoSvKogwODv0ClqZkmiSSaIH5LTA==", - "license": "ISC", - "bin": { - "yaml": "bin.mjs" - }, - "engines": { - "node": ">= 14" - } - }, - "node_modules/yocto-queue": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", - "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/zwitch": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/zwitch/-/zwitch-2.0.4.tgz", - "integrity": "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==", - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" - } - } - } -} diff --git a/examples/server/webui/package.json b/examples/server/webui/package.json deleted file mode 100644 index 3be2b14de..000000000 --- a/examples/server/webui/package.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "name": "webui", - "private": true, - "version": "0.0.0", - "type": "module", - "scripts": { - "dev": "vite", - "build": "tsc -b && vite build", - "format": "eslint . 
&& prettier --write .", - "lint": "eslint .", - "preview": "vite preview" - }, - "dependencies": { - "@heroicons/react": "^2.2.0", - "@sec-ant/readable-stream": "^0.6.0", - "@vscode/markdown-it-katex": "^1.1.1", - "autoprefixer": "^10.4.20", - "daisyui": "^4.12.14", - "highlight.js": "^11.10.0", - "katex": "^0.16.15", - "postcss": "^8.4.49", - "react": "^18.3.1", - "react-dom": "^18.3.1", - "react-markdown": "^9.0.3", - "react-router": "^7.1.5", - "rehype-highlight": "^7.0.2", - "rehype-katex": "^7.0.1", - "remark-breaks": "^4.0.0", - "remark-gfm": "^4.0.0", - "remark-math": "^6.0.0", - "tailwindcss": "^3.4.15", - "textlinestream": "^1.1.1", - "vite-plugin-singlefile": "^2.0.3" - }, - "devDependencies": { - "@eslint/js": "^9.17.0", - "@types/markdown-it": "^14.1.2", - "@types/node": "^22.13.1", - "@types/react": "^18.3.18", - "@types/react-dom": "^18.3.5", - "@vitejs/plugin-react": "^4.3.4", - "eslint": "^9.17.0", - "eslint-plugin-react-hooks": "^5.0.0", - "eslint-plugin-react-refresh": "^0.4.16", - "globals": "^15.14.0", - "prettier": "^3.4.2", - "sass-embedded": "^1.83.4", - "typescript": "~5.6.2", - "typescript-eslint": "^8.18.2", - "vite": "^6.0.5" - }, - "prettier": { - "trailingComma": "es5", - "tabWidth": 2, - "semi": true, - "singleQuote": true, - "bracketSameLine": false - } -} diff --git a/examples/server/webui/postcss.config.js b/examples/server/webui/postcss.config.js deleted file mode 100644 index 2e7af2b7f..000000000 --- a/examples/server/webui/postcss.config.js +++ /dev/null @@ -1,6 +0,0 @@ -export default { - plugins: { - tailwindcss: {}, - autoprefixer: {}, - }, -} diff --git a/examples/server/webui/public/demo-conversation.json b/examples/server/webui/public/demo-conversation.json deleted file mode 100644 index 338b4aea5..000000000 --- a/examples/server/webui/public/demo-conversation.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "demo": true, - "id": "conv-1734086746930", - "lastModified": 1734087548943, - "messages": [ - { - "id": 1734086764521, - "role": "user", - "content": "this is a demo conversation, used in dev mode" - }, - { - "id": 1734087548327, - "role": "assistant", - "content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\n$2x + y = z$\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$", - "timings": { - "prompt_n": 1, - "prompt_ms": 28.923, - "predicted_n": 25, - "predicted_ms": 573.016 - } - }, - { - "id": 1734087548328, - "role": "user", - "content": "this is a demo conversation, used in dev mode" - }, - { - "id": 1734087548329, - "role": "assistant", - "content": "Code block:\n```js\nconsole.log('hello world')\n```\n```sh\nls -la /dev\n```" - } - ] -} diff --git a/examples/server/webui/src/App.tsx b/examples/server/webui/src/App.tsx deleted file mode 100644 index 2ce734682..000000000 --- a/examples/server/webui/src/App.tsx +++ /dev/null @@ -1,47 +0,0 @@ -import { HashRouter, Outlet, Route, Routes } from 'react-router'; -import Header from './components/Header'; -import Sidebar from './components/Sidebar'; -import { AppContextProvider, useAppContext } from './utils/app.context'; -import ChatScreen from './components/ChatScreen'; -import SettingDialog from './components/SettingDialog'; - -function App() { - return ( - -
- - - }> - } /> - } /> - - - -
-
- ); -} - -function AppLayout() { - const { showSettings, setShowSettings } = useAppContext(); - return ( - <> - -
-
- -
- { - setShowSettings(false)} - /> - } - - ); -} - -export default App; diff --git a/examples/server/webui/src/Config.ts b/examples/server/webui/src/Config.ts deleted file mode 100644 index 779ed9bf7..000000000 --- a/examples/server/webui/src/Config.ts +++ /dev/null @@ -1,92 +0,0 @@ -import daisyuiThemes from 'daisyui/src/theming/themes'; -import { isNumeric } from './utils/misc'; - -export const isDev = import.meta.env.MODE === 'development'; - -// constants -export const BASE_URL = new URL('.', document.baseURI).href - .toString() - .replace(/\/$/, ''); - -export const CONFIG_DEFAULT = { - // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value. - // Do not use nested objects, keep it single level. Prefix the key if you need to group them. - apiKey: '', - systemMessage: 'You are a helpful assistant.', - showTokensPerSecond: false, - showThoughtInProgress: false, - excludeThoughtOnReq: true, - // make sure these default values are in sync with `common.h` - samplers: 'edkypmxt', - temperature: 0.8, - dynatemp_range: 0.0, - dynatemp_exponent: 1.0, - top_k: 40, - top_p: 0.95, - min_p: 0.05, - xtc_probability: 0.0, - xtc_threshold: 0.1, - typical_p: 1.0, - repeat_last_n: 64, - repeat_penalty: 1.0, - presence_penalty: 0.0, - frequency_penalty: 0.0, - dry_multiplier: 0.0, - dry_base: 1.75, - dry_allowed_length: 2, - dry_penalty_last_n: -1, - max_tokens: -1, - custom: '', // custom json-stringified object - // experimental features - pyIntepreterEnabled: false, -}; -export const CONFIG_INFO: Record = { - apiKey: 'Set the API Key if you are using --api-key option for the server.', - systemMessage: 'The starting message that defines how model should behave.', - samplers: - 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature', - temperature: - 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.', - dynatemp_range: - 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.', - dynatemp_exponent: - 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.', - top_k: 'Keeps only k top tokens.', - top_p: - 'Limits tokens to those that together have a cumulative probability of at least p', - min_p: - 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.', - xtc_probability: - 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.', - xtc_threshold: - 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.', - typical_p: - 'Sorts and limits tokens based on the difference between log-probability and entropy.', - repeat_last_n: 'Last n tokens to consider for penalizing repetition', - repeat_penalty: - 'Controls the repetition of token sequences in the generated text', - presence_penalty: - 'Limits tokens based on whether they appear in the output or not.', - frequency_penalty: - 'Limits tokens based on how often they appear in the output.', - dry_multiplier: - 'DRY sampling reduces repetition in generated text even across long contexts. 
This parameter sets the DRY sampling multiplier.', - dry_base: - 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.', - dry_allowed_length: - 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.', - dry_penalty_last_n: - 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.', - max_tokens: 'The maximum number of token per output.', - custom: '', // custom json-stringified object -}; -// config keys having numeric value (i.e. temperature, top_k, top_p, etc) -export const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT) - .filter((e) => isNumeric(e[1])) - .map((e) => e[0]); -// list of themes supported by daisyui -export const THEMES = ['light', 'dark'] - // make sure light & dark are always at the beginning - .concat( - Object.keys(daisyuiThemes).filter((t) => t !== 'light' && t !== 'dark') - ); diff --git a/examples/server/webui/src/components/CanvasPyInterpreter.tsx b/examples/server/webui/src/components/CanvasPyInterpreter.tsx deleted file mode 100644 index c2707fe20..000000000 --- a/examples/server/webui/src/components/CanvasPyInterpreter.tsx +++ /dev/null @@ -1,195 +0,0 @@ -import { useEffect, useState } from 'react'; -import { useAppContext } from '../utils/app.context'; -import { OpenInNewTab, XCloseButton } from '../utils/common'; -import { CanvasType } from '../utils/types'; -import { PlayIcon, StopIcon } from '@heroicons/react/24/outline'; -import StorageUtils from '../utils/storage'; - -const canInterrupt = typeof SharedArrayBuffer === 'function'; - -// adapted from https://pyodide.org/en/stable/usage/webworker.html -const WORKER_CODE = ` -importScripts("https://cdn.jsdelivr.net/pyodide/v0.27.2/full/pyodide.js"); - -let stdOutAndErr = []; - -let pyodideReadyPromise = loadPyodide({ - stdout: (data) => stdOutAndErr.push(data), - stderr: (data) => stdOutAndErr.push(data), -}); - -let alreadySetBuff = false; - -self.onmessage = async (event) => { - stdOutAndErr = []; - - // make sure loading is done - const pyodide = await pyodideReadyPromise; - const { id, python, context, interruptBuffer } = event.data; - - if (interruptBuffer && !alreadySetBuff) { - pyodide.setInterruptBuffer(interruptBuffer); - alreadySetBuff = true; - } - - // Now load any packages we need, run the code, and send the result back. - await pyodide.loadPackagesFromImports(python); - - // make a Python dictionary with the data from content - const dict = pyodide.globals.get("dict"); - const globals = dict(Object.entries(context)); - try { - self.postMessage({ id, running: true }); - // Execute the python code in this context - const result = pyodide.runPython(python, { globals }); - self.postMessage({ result, id, stdOutAndErr }); - } catch (error) { - self.postMessage({ error: error.message, id }); - } - interruptBuffer[0] = 0; -}; -`; - -let worker: Worker; -const interruptBuffer = canInterrupt - ? 
new Uint8Array(new SharedArrayBuffer(1)) - : null; - -const startWorker = () => { - if (!worker) { - worker = new Worker( - URL.createObjectURL(new Blob([WORKER_CODE], { type: 'text/javascript' })) - ); - } -}; - -if (StorageUtils.getConfig().pyIntepreterEnabled) { - startWorker(); -} - -const runCodeInWorker = ( - pyCode: string, - callbackRunning: () => void -): { - donePromise: Promise; - interrupt: () => void; -} => { - startWorker(); - const id = Math.random() * 1e8; - const context = {}; - if (interruptBuffer) { - interruptBuffer[0] = 0; - } - - const donePromise = new Promise((resolve) => { - worker.onmessage = (event) => { - const { error, stdOutAndErr, running } = event.data; - if (id !== event.data.id) return; - if (running) { - callbackRunning(); - return; - } else if (error) { - resolve(error.toString()); - } else { - resolve(stdOutAndErr.join('\n')); - } - }; - worker.postMessage({ id, python: pyCode, context, interruptBuffer }); - }); - - const interrupt = () => { - console.log('Interrupting...'); - console.trace(); - if (interruptBuffer) { - interruptBuffer[0] = 2; - } - }; - - return { donePromise, interrupt }; -}; - -export default function CanvasPyInterpreter() { - const { canvasData, setCanvasData } = useAppContext(); - - const [code, setCode] = useState(canvasData?.content ?? ''); // copy to avoid direct mutation - const [running, setRunning] = useState(false); - const [output, setOutput] = useState(''); - const [interruptFn, setInterruptFn] = useState<() => void>(); - const [showStopBtn, setShowStopBtn] = useState(false); - - const runCode = async (pycode: string) => { - interruptFn?.(); - setRunning(true); - setOutput('Loading Pyodide...'); - const { donePromise, interrupt } = runCodeInWorker(pycode, () => { - setOutput('Running...'); - setShowStopBtn(canInterrupt); - }); - setInterruptFn(() => interrupt); - const out = await donePromise; - setOutput(out); - setRunning(false); - setShowStopBtn(false); - }; - - // run code on mount - useEffect(() => { - setCode(canvasData?.content ?? ''); - runCode(canvasData?.content ?? ''); - // eslint-disable-next-line react-hooks/exhaustive-deps - }, [canvasData?.content]); - - if (canvasData?.type !== CanvasType.PY_INTERPRETER) { - return null; - } - - return ( -
-
-
- Python Interpreter - setCanvasData(null)} - /> -
-
- -
-
- - {showStopBtn && ( - - )} - - - Report a bug - - -
- -
-
-
-
- ); -} diff --git a/examples/server/webui/src/components/ChatMessage.tsx b/examples/server/webui/src/components/ChatMessage.tsx deleted file mode 100644 index ec72196ba..000000000 --- a/examples/server/webui/src/components/ChatMessage.tsx +++ /dev/null @@ -1,235 +0,0 @@ -import { useMemo, useState } from 'react'; -import { useAppContext } from '../utils/app.context'; -import { Message, PendingMessage } from '../utils/types'; -import { classNames } from '../utils/misc'; -import MarkdownDisplay, { CopyButton } from './MarkdownDisplay'; - -interface SplitMessage { - content: PendingMessage['content']; - thought?: string; - isThinking?: boolean; -} - -export default function ChatMessage({ - msg, - id, - scrollToBottom, - isPending, -}: { - msg: Message | PendingMessage; - id?: string; - scrollToBottom: (requiresNearBottom: boolean) => void; - isPending?: boolean; -}) { - const { viewingConversation, replaceMessageAndGenerate, config } = - useAppContext(); - const [editingContent, setEditingContent] = useState(null); - const timings = useMemo( - () => - msg.timings - ? { - ...msg.timings, - prompt_per_second: - (msg.timings.prompt_n / msg.timings.prompt_ms) * 1000, - predicted_per_second: - (msg.timings.predicted_n / msg.timings.predicted_ms) * 1000, - } - : null, - [msg.timings] - ); - - // for reasoning model, we split the message into content and thought - // TODO: implement this as remark/rehype plugin in the future - const { content, thought, isThinking }: SplitMessage = useMemo(() => { - if (msg.content === null || msg.role !== 'assistant') { - return { content: msg.content }; - } - let actualContent = ''; - let thought = ''; - let isThinking = false; - let thinkSplit = msg.content.split('', 2); - actualContent += thinkSplit[0]; - while (thinkSplit[1] !== undefined) { - // tag found - thinkSplit = thinkSplit[1].split('', 2); - thought += thinkSplit[0]; - isThinking = true; - if (thinkSplit[1] !== undefined) { - // closing tag found - isThinking = false; - thinkSplit = thinkSplit[1].split('', 2); - actualContent += thinkSplit[0]; - } - } - return { content: actualContent, thought, isThinking }; - }, [msg]); - - if (!viewingConversation) return null; - - const regenerate = async () => { - replaceMessageAndGenerate(viewingConversation.id, msg.id, undefined, () => - scrollToBottom(true) - ); - }; - - return ( -
-
-
- {/* textarea for editing message */} - {editingContent !== null && ( - <> - -
- - - - )} - {/* not editing content, render message */} - {editingContent === null && ( - <> - {content === null ? ( - <> - {/* show loading dots for pending message */} - - - ) : ( - <> - {/* render message as markdown */} -
- {thought && ( -
- - {isPending && isThinking ? ( - - - Thinking - - ) : ( - Thought Process - )} - -
- -
-
- )} - -
- - )} - {/* render timings if enabled */} - {timings && config.showTokensPerSecond && ( -
-
- Speed: {timings.predicted_per_second.toFixed(1)} t/s -
-
- Prompt -
- Tokens: {timings.prompt_n} -
- Time: {timings.prompt_ms} ms -
- Speed: {timings.prompt_per_second.toFixed(1)} t/s -
- Generation -
- Tokens: {timings.predicted_n} -
- Time: {timings.predicted_ms} ms -
- Speed: {timings.predicted_per_second.toFixed(1)} t/s -
-
-
- )} - - )} -
-
- - {/* actions for each message */} - {msg.content !== null && ( -
- {/* user message */} - {msg.role === 'user' && ( - - )} - {/* assistant message */} - {msg.role === 'assistant' && ( - <> - {!isPending && ( - - )} - - - )} -
- )} -
- ); -} diff --git a/examples/server/webui/src/components/ChatScreen.tsx b/examples/server/webui/src/components/ChatScreen.tsx deleted file mode 100644 index dbc683ed1..000000000 --- a/examples/server/webui/src/components/ChatScreen.tsx +++ /dev/null @@ -1,146 +0,0 @@ -import { useEffect, useState } from 'react'; -import { useAppContext } from '../utils/app.context'; -import StorageUtils from '../utils/storage'; -import { useNavigate } from 'react-router'; -import ChatMessage from './ChatMessage'; -import { CanvasType, PendingMessage } from '../utils/types'; -import { classNames } from '../utils/misc'; -import CanvasPyInterpreter from './CanvasPyInterpreter'; - -export default function ChatScreen() { - const { - viewingConversation, - sendMessage, - isGenerating, - stopGenerating, - pendingMessages, - canvasData, - } = useAppContext(); - const [inputMsg, setInputMsg] = useState(''); - const navigate = useNavigate(); - - const currConvId = viewingConversation?.id ?? ''; - const pendingMsg: PendingMessage | undefined = pendingMessages[currConvId]; - - const scrollToBottom = (requiresNearBottom: boolean) => { - const mainScrollElem = document.getElementById('main-scroll'); - if (!mainScrollElem) return; - const spaceToBottom = - mainScrollElem.scrollHeight - - mainScrollElem.scrollTop - - mainScrollElem.clientHeight; - if (!requiresNearBottom || spaceToBottom < 50) { - setTimeout( - () => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }), - 1 - ); - } - }; - - // scroll to bottom when conversation changes - useEffect(() => { - scrollToBottom(false); - }, [viewingConversation?.id]); - - const sendNewMessage = async () => { - if (inputMsg.trim().length === 0 || isGenerating(currConvId)) return; - const convId = viewingConversation?.id ?? StorageUtils.getNewConvId(); - const lastInpMsg = inputMsg; - setInputMsg(''); - if (!viewingConversation) { - // if user is creating a new conversation, redirect to the new conversation - navigate(`/chat/${convId}`); - } - scrollToBottom(false); - // auto scroll as message is being generated - const onChunk = () => scrollToBottom(true); - if (!(await sendMessage(convId, inputMsg, onChunk))) { - // restore the input message if failed - setInputMsg(lastInpMsg); - } - }; - - const hasCanvas = !!canvasData; - - return ( -
-
- {/* chat messages */} -
-
- {/* placeholder to shift the message to the bottom */} - {viewingConversation ? '' : 'Send a message to start'} -
- {viewingConversation?.messages.map((msg) => ( - - ))} - - {pendingMsg && ( - - )} -
- - {/* chat input */} -
- - {isGenerating(currConvId) ? ( - - ) : ( - - )} -
-
-
- {canvasData?.type === CanvasType.PY_INTERPRETER && ( - - )} -
-
- ); -} diff --git a/examples/server/webui/src/components/Header.tsx b/examples/server/webui/src/components/Header.tsx deleted file mode 100644 index 505350313..000000000 --- a/examples/server/webui/src/components/Header.tsx +++ /dev/null @@ -1,176 +0,0 @@ -import { useEffect, useState } from 'react'; -import StorageUtils from '../utils/storage'; -import { useAppContext } from '../utils/app.context'; -import { classNames } from '../utils/misc'; -import daisyuiThemes from 'daisyui/src/theming/themes'; -import { THEMES } from '../Config'; -import { useNavigate } from 'react-router'; - -export default function Header() { - const navigate = useNavigate(); - const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme()); - const { setShowSettings } = useAppContext(); - - const setTheme = (theme: string) => { - StorageUtils.setTheme(theme); - setSelectedTheme(theme); - }; - - useEffect(() => { - document.body.setAttribute('data-theme', selectedTheme); - document.body.setAttribute( - 'data-color-scheme', - // @ts-expect-error daisyuiThemes complains about index type, but it should work - daisyuiThemes[selectedTheme]?.['color-scheme'] ?? 'auto' - ); - }, [selectedTheme]); - - const { isGenerating, viewingConversation } = useAppContext(); - const isCurrConvGenerating = isGenerating(viewingConversation?.id ?? ''); - - const removeConversation = () => { - if (isCurrConvGenerating || !viewingConversation) return; - const convId = viewingConversation.id; - if (window.confirm('Are you sure to delete this conversation?')) { - StorageUtils.remove(convId); - navigate('/'); - } - }; - - const downloadConversation = () => { - if (isCurrConvGenerating || !viewingConversation) return; - const convId = viewingConversation.id; - const conversationJson = JSON.stringify(viewingConversation, null, 2); - const blob = new Blob([conversationJson], { type: 'application/json' }); - const url = URL.createObjectURL(blob); - const a = document.createElement('a'); - a.href = url; - a.download = `conversation_${convId}.json`; - document.body.appendChild(a); - a.click(); - document.body.removeChild(a); - URL.revokeObjectURL(url); - }; - - return ( -
- {/* open sidebar button */} - - -
llama.cpp
- - {/* action buttons (top right) */} -
-
- {/* "..." button */} - - {/* dropdown menu */} - -
-
- -
- - {/* theme controller is copied from https://daisyui.com/components/theme-controller/ */} -
-
-
- - - -
-
    -
- -
- {THEMES.map((theme) => ( -
- e.target.checked && setTheme(theme)} - /> -
- ))} -
-
-
-
-
- ); -} diff --git a/examples/server/webui/src/components/MarkdownDisplay.tsx b/examples/server/webui/src/components/MarkdownDisplay.tsx deleted file mode 100644 index 5b7a72591..000000000 --- a/examples/server/webui/src/components/MarkdownDisplay.tsx +++ /dev/null @@ -1,310 +0,0 @@ -import React, { useMemo, useState } from 'react'; -import Markdown, { ExtraProps } from 'react-markdown'; -import remarkGfm from 'remark-gfm'; -import rehypeHightlight from 'rehype-highlight'; -import rehypeKatex from 'rehype-katex'; -import remarkMath from 'remark-math'; -import remarkBreaks from 'remark-breaks'; -import 'katex/dist/katex.min.css'; -import { classNames, copyStr } from '../utils/misc'; -import { ElementContent, Root } from 'hast'; -import { visit } from 'unist-util-visit'; -import { useAppContext } from '../utils/app.context'; -import { CanvasType } from '../utils/types'; - -export default function MarkdownDisplay({ - content, - isGenerating, -}: { - content: string; - isGenerating?: boolean; -}) { - const preprocessedContent = useMemo( - () => preprocessLaTeX(content), - [content] - ); - return ( - ( - - ), - // note: do not use "pre", "p" or other basic html elements here, it will cause the node to re-render when the message is being generated (this should be a bug with react-markdown, not sure how to fix it) - }} - > - {preprocessedContent} - - ); -} - -const CodeBlockButtons: React.ElementType< - React.ClassAttributes & - React.HTMLAttributes & - ExtraProps & { origContent: string; isGenerating?: boolean } -> = ({ node, origContent, isGenerating }) => { - const { config } = useAppContext(); - const startOffset = node?.position?.start.offset ?? 0; - const endOffset = node?.position?.end.offset ?? 0; - - const copiedContent = useMemo( - () => - origContent - .substring(startOffset, endOffset) - .replace(/^```[^\n]+\n/g, '') - .replace(/```$/g, ''), - [origContent, startOffset, endOffset] - ); - - const codeLanguage = useMemo( - () => - origContent - .substring(startOffset, startOffset + 10) - .match(/^```([^\n]+)\n/)?.[1] ?? '', - [origContent, startOffset] - ); - - const canRunCode = - !isGenerating && - config.pyIntepreterEnabled && - codeLanguage.startsWith('py'); - - return ( -
- - {canRunCode && ( - - )} -
- ); -}; - -export const CopyButton = ({ - content, - className, -}: { - content: string; - className?: string; -}) => { - const [copied, setCopied] = useState(false); - return ( - - ); -}; - -export const RunPyCodeButton = ({ - content, - className, -}: { - content: string; - className?: string; -}) => { - const { setCanvasData } = useAppContext(); - return ( - <> - - - ); -}; - -/** - * This injects the "button" element before each "pre" element. - * The actual button will be replaced with a react component in the MarkdownDisplay. - * We don't replace "pre" node directly because it will cause the node to re-render, which causes this bug: https://github.com/ggerganov/llama.cpp/issues/9608 - */ -function rehypeCustomCopyButton() { - return function (tree: Root) { - visit(tree, 'element', function (node) { - if (node.tagName === 'pre' && !node.properties.visited) { - const preNode = { ...node }; - // replace current node - preNode.properties.visited = 'true'; - node.tagName = 'div'; - node.properties = {}; - // add node for button - const btnNode: ElementContent = { - type: 'element', - tagName: 'button', - properties: {}, - children: [], - position: node.position, - }; - node.children = [btnNode, preNode]; - } - }); - }; -} - -/** - * The part below is copied and adapted from: - * https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts - * (MIT License) - */ - -// Regex to check if the processed content contains any potential LaTeX patterns -const containsLatexRegex = - /\\\(.*?\\\)|\\\[.*?\\\]|\$.*?\$|\\begin\{equation\}.*?\\end\{equation\}/; - -// Regex for inline and block LaTeX expressions -const inlineLatex = new RegExp(/\\\((.+?)\\\)/, 'g'); -const blockLatex = new RegExp(/\\\[(.*?[^\\])\\\]/, 'gs'); - -// Function to restore code blocks -const restoreCodeBlocks = (content: string, codeBlocks: string[]) => { - return content.replace( - /<>/g, - (_, index) => codeBlocks[index] - ); -}; - -// Regex to identify code blocks and inline code -const codeBlockRegex = /(```[\s\S]*?```|`.*?`)/g; - -export const processLaTeX = (_content: string) => { - let content = _content; - // Temporarily replace code blocks and inline code with placeholders - const codeBlocks: string[] = []; - let index = 0; - content = content.replace(codeBlockRegex, (match) => { - codeBlocks[index] = match; - return `<>`; - }); - - // Escape dollar signs followed by a digit or space and digit - let processedContent = content.replace(/(\$)(?=\s?\d)/g, '\\$'); - - // If no LaTeX patterns are found, restore code blocks and return the processed content - if (!containsLatexRegex.test(processedContent)) { - return restoreCodeBlocks(processedContent, codeBlocks); - } - - // Convert LaTeX expressions to a markdown compatible format - processedContent = processedContent - .replace(inlineLatex, (_: string, equation: string) => `$${equation}$`) // Convert inline LaTeX - .replace(blockLatex, (_: string, equation: string) => `$$${equation}$$`); // Convert block LaTeX - - // Restore code blocks - return restoreCodeBlocks(processedContent, codeBlocks); -}; - -/** - * Preprocesses LaTeX content by replacing delimiters and escaping certain characters. - * - * @param content The input string containing LaTeX expressions. - * @returns The processed string with replaced delimiters and escaped characters. 
- */ -export function preprocessLaTeX(content: string): string { - // Step 1: Protect code blocks - const codeBlocks: string[] = []; - content = content.replace(/(```[\s\S]*?```|`[^`\n]+`)/g, (_, code) => { - codeBlocks.push(code); - return `<>`; - }); - - // Step 2: Protect existing LaTeX expressions - const latexExpressions: string[] = []; - - // Protect block math ($$...$$), \[...\], and \(...\) as before. - content = content.replace( - /(\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]|\\\(.*?\\\))/g, - (match) => { - latexExpressions.push(match); - return `<>`; - } - ); - - // Protect inline math ($...$) only if it does NOT match a currency pattern. - // We assume a currency pattern is one where the inner content is purely numeric (with optional decimals). - content = content.replace(/\$([^$]+)\$/g, (match, inner) => { - if (/^\s*\d+(?:\.\d+)?\s*$/.test(inner)) { - // This looks like a currency value (e.g. "$123" or "$12.34"), - // so don't protect it. - return match; - } else { - // Otherwise, treat it as a LaTeX expression. - latexExpressions.push(match); - return `<>`; - } - }); - - // Step 3: Escape dollar signs that are likely currency indicators. - // (Now that inline math is protected, this will only escape dollars not already protected) - content = content.replace(/\$(?=\d)/g, '\\$'); - - // Step 4: Restore LaTeX expressions - content = content.replace( - /<>/g, - (_, index) => latexExpressions[parseInt(index)] - ); - - // Step 5: Restore code blocks - content = content.replace( - /<>/g, - (_, index) => codeBlocks[parseInt(index)] - ); - - // Step 6: Apply additional escaping functions - content = escapeBrackets(content); - content = escapeMhchem(content); - - return content; -} - -export function escapeBrackets(text: string): string { - const pattern = - /(```[\S\s]*?```|`.*?`)|\\\[([\S\s]*?[^\\])\\]|\\\((.*?)\\\)/g; - return text.replace( - pattern, - ( - match: string, - codeBlock: string | undefined, - squareBracket: string | undefined, - roundBracket: string | undefined - ): string => { - if (codeBlock != null) { - return codeBlock; - } else if (squareBracket != null) { - return `$$${squareBracket}$$`; - } else if (roundBracket != null) { - return `$${roundBracket}$`; - } - return match; - } - ); -} - -export function escapeMhchem(text: string) { - return text.replaceAll('$\\ce{', '$\\\\ce{').replaceAll('$\\pu{', '$\\\\pu{'); -} diff --git a/examples/server/webui/src/components/SettingDialog.tsx b/examples/server/webui/src/components/SettingDialog.tsx deleted file mode 100644 index 592b93fa3..000000000 --- a/examples/server/webui/src/components/SettingDialog.tsx +++ /dev/null @@ -1,536 +0,0 @@ -import { useState } from 'react'; -import { useAppContext } from '../utils/app.context'; -import { CONFIG_DEFAULT, CONFIG_INFO } from '../Config'; -import { isDev } from '../Config'; -import StorageUtils from '../utils/storage'; -import { classNames, isBoolean, isNumeric, isString } from '../utils/misc'; -import { - BeakerIcon, - ChatBubbleOvalLeftEllipsisIcon, - Cog6ToothIcon, - FunnelIcon, - HandRaisedIcon, - SquaresPlusIcon, -} from '@heroicons/react/24/outline'; -import { OpenInNewTab } from '../utils/common'; - -type SettKey = keyof typeof CONFIG_DEFAULT; - -const BASIC_KEYS: SettKey[] = [ - 'temperature', - 'top_k', - 'top_p', - 'min_p', - 'max_tokens', -]; -const SAMPLER_KEYS: SettKey[] = [ - 'dynatemp_range', - 'dynatemp_exponent', - 'typical_p', - 'xtc_probability', - 'xtc_threshold', -]; -const PENALTY_KEYS: SettKey[] = [ - 'repeat_last_n', - 'repeat_penalty', - 'presence_penalty', - 
'frequency_penalty', - 'dry_multiplier', - 'dry_base', - 'dry_allowed_length', - 'dry_penalty_last_n', -]; - -enum SettingInputType { - SHORT_INPUT, - LONG_INPUT, - CHECKBOX, - CUSTOM, -} - -interface SettingFieldInput { - type: Exclude; - label: string | React.ReactElement; - help?: string | React.ReactElement; - key: SettKey; -} - -interface SettingFieldCustom { - type: SettingInputType.CUSTOM; - key: SettKey; - component: - | string - | React.FC<{ - value: string | boolean | number; - onChange: (value: string) => void; - }>; -} - -interface SettingSection { - title: React.ReactElement; - fields: (SettingFieldInput | SettingFieldCustom)[]; -} - -const ICON_CLASSNAME = 'w-4 h-4 mr-1 inline'; - -const SETTING_SECTIONS: SettingSection[] = [ - { - title: ( - <> - - General - - ), - fields: [ - { - type: SettingInputType.SHORT_INPUT, - label: 'API Key', - key: 'apiKey', - }, - { - type: SettingInputType.LONG_INPUT, - label: 'System Message (will be disabled if left empty)', - key: 'systemMessage', - }, - ...BASIC_KEYS.map( - (key) => - ({ - type: SettingInputType.SHORT_INPUT, - label: key, - key, - }) as SettingFieldInput - ), - ], - }, - { - title: ( - <> - - Samplers - - ), - fields: [ - { - type: SettingInputType.SHORT_INPUT, - label: 'Samplers queue', - key: 'samplers', - }, - ...SAMPLER_KEYS.map( - (key) => - ({ - type: SettingInputType.SHORT_INPUT, - label: key, - key, - }) as SettingFieldInput - ), - ], - }, - { - title: ( - <> - - Penalties - - ), - fields: PENALTY_KEYS.map((key) => ({ - type: SettingInputType.SHORT_INPUT, - label: key, - key, - })), - }, - { - title: ( - <> - - Reasoning - - ), - fields: [ - { - type: SettingInputType.CHECKBOX, - label: 'Expand though process by default for generating message', - key: 'showThoughtInProgress', - }, - { - type: SettingInputType.CHECKBOX, - label: - 'Exclude thought process when sending request to API (Recommended for DeepSeek-R1)', - key: 'excludeThoughtOnReq', - }, - ], - }, - { - title: ( - <> - - Advanced - - ), - fields: [ - { - type: SettingInputType.CUSTOM, - key: 'custom', // dummy key, won't be used - component: () => { - const debugImportDemoConv = async () => { - const res = await fetch('/demo-conversation.json'); - const demoConv = await res.json(); - StorageUtils.remove(demoConv.id); - for (const msg of demoConv.messages) { - StorageUtils.appendMsg(demoConv.id, msg); - } - }; - return ( - - ); - }, - }, - { - type: SettingInputType.CHECKBOX, - label: 'Show tokens per second', - key: 'showTokensPerSecond', - }, - { - type: SettingInputType.LONG_INPUT, - label: ( - <> - Custom JSON config (For more info, refer to{' '} - - server documentation - - ) - - ), - key: 'custom', - }, - ], - }, - { - title: ( - <> - - Experimental - - ), - fields: [ - { - type: SettingInputType.CUSTOM, - key: 'custom', // dummy key, won't be used - component: () => ( - <> -

- Experimental features are not guaranteed to work correctly. -
-
- If you encounter any problems, create a{' '} - - Bug (misc.) - {' '} - report on GitHub. Please also specify webui/experimental in - the report title and include screenshots. -
-
- Some features may require packages downloaded from CDN, so they - need an internet connection. -

- - ), - }, - { - type: SettingInputType.CHECKBOX, - label: ( - <> - Enable Python interpreter -
- - This feature uses{' '} - pyodide, - downloaded from CDN. To use this feature, ask the LLM to generate - python code inside a markdown code block. You will see a "Run" - button on the code block, near the "Copy" button. - - - ), - key: 'pyIntepreterEnabled', - }, - ], - }, -]; - -export default function SettingDialog({ - show, - onClose, -}: { - show: boolean; - onClose: () => void; -}) { - const { config, saveConfig } = useAppContext(); - const [sectionIdx, setSectionIdx] = useState(0); - - // clone the config object to prevent direct mutation - const [localConfig, setLocalConfig] = useState( - JSON.parse(JSON.stringify(config)) - ); - - const resetConfig = () => { - if (window.confirm('Are you sure to reset all settings?')) { - setLocalConfig(CONFIG_DEFAULT); - } - }; - - const handleSave = () => { - // copy the local config to prevent direct mutation - const newConfig: typeof CONFIG_DEFAULT = JSON.parse( - JSON.stringify(localConfig) - ); - // validate the config - for (const key in newConfig) { - const value = newConfig[key as SettKey]; - const mustBeBoolean = isBoolean(CONFIG_DEFAULT[key as SettKey]); - const mustBeString = isString(CONFIG_DEFAULT[key as SettKey]); - const mustBeNumeric = isNumeric(CONFIG_DEFAULT[key as SettKey]); - if (mustBeString) { - if (!isString(value)) { - alert(`Value for ${key} must be string`); - return; - } - } else if (mustBeNumeric) { - const trimedValue = value.toString().trim(); - const numVal = Number(trimedValue); - if (isNaN(numVal) || !isNumeric(numVal) || trimedValue.length === 0) { - alert(`Value for ${key} must be numeric`); - return; - } - // force conversion to number - // @ts-expect-error this is safe - newConfig[key] = numVal; - } else if (mustBeBoolean) { - if (!isBoolean(value)) { - alert(`Value for ${key} must be boolean`); - return; - } - } else { - console.error(`Unknown default type for key ${key}`); - } - } - if (isDev) console.log('Saving config', newConfig); - saveConfig(newConfig); - onClose(); - }; - - const onChange = (key: SettKey) => (value: string | boolean) => { - // note: we do not perform validation here, because we may get incomplete value as user is still typing it - setLocalConfig({ ...localConfig, [key]: value }); - }; - - return ( - -
-

Settings

-
- {/* Left panel, showing sections - Desktop version */} -
- {SETTING_SECTIONS.map((section, idx) => ( -
setSectionIdx(idx)} - dir="auto" - > - {section.title} -
- ))} -
- - {/* Left panel, showing sections - Mobile version */} -
-
- - {SETTING_SECTIONS[sectionIdx].title} - -
    - {SETTING_SECTIONS.map((section, idx) => ( -
    setSectionIdx(idx)} - dir="auto" - > - {section.title} -
    - ))} -
-
-
- - {/* Right panel, showing setting fields */} -
- {SETTING_SECTIONS[sectionIdx].fields.map((field, idx) => { - const key = `${sectionIdx}-${idx}`; - if (field.type === SettingInputType.SHORT_INPUT) { - return ( - - ); - } else if (field.type === SettingInputType.LONG_INPUT) { - return ( - - ); - } else if (field.type === SettingInputType.CHECKBOX) { - return ( - - ); - } else if (field.type === SettingInputType.CUSTOM) { - return ( -
- {typeof field.component === 'string' - ? field.component - : field.component({ - value: localConfig[field.key], - onChange: onChange(field.key), - })} -
- ); - } - })} - -

- Settings are saved in the browser's localStorage -

-
-
- -
- - - -
-
-
- ); -} - -function SettingsModalLongInput({ - configKey, - value, - onChange, - label, -}: { - configKey: SettKey; - value: string; - onChange: (value: string) => void; - label?: string; -}) { - return ( -