diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..45232b80e --- /dev/null +++ b/.clang-format @@ -0,0 +1,161 @@ +--- +Language: Cpp +AlignAfterOpenBracket: Align +AlignArrayOfStructures: Left +AlignConsecutiveAssignments: AcrossComments +AlignConsecutiveBitFields: AcrossComments +AlignConsecutiveDeclarations: AcrossComments +AlignConsecutiveMacros: AcrossComments +# AlignConsecutiveShortCaseStatements: AcrossComments +AlignEscapedNewlines: Left # LeftWithLastLine +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 1 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: Inline +AllowShortLoopsOnASingleLine: false +AlwaysBreakBeforeMultilineStrings: true +BinPackArguments: true +BinPackParameters: true # OnePerLine +BitFieldColonSpacing: Both +BreakBeforeBraces: Custom # Attach +BraceWrapping: + AfterCaseLabel: true + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +# BreakAdjacentStringLiterals: true +BreakAfterAttributes: Never +BreakBeforeBinaryOperators: None +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: false +# BreakBinaryOperations: Never +BreakConstructorInitializers: AfterColon +# BreakFunctionDefinitionParameters: false +BreakInheritanceList: AfterComma +BreakStringLiterals: true +# BreakTemplateDeclarations: Yes +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +EmptyLineBeforeAccessModifier: Leave +EmptyLineAfterAccessModifier: Never +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + - Regex: '.*' + Priority: 3 + SortPriority: 0 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: true +IndentCaseLabels: true +IndentExternBlock: NoIndent +IndentGotoLabels: false +IndentPPDirectives: AfterHash +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: true # NOTE: may lead to incorrect formatting +InsertNewlineAtEOF: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: LF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PPIndentWidth: -1 +PackConstructorInitializers: CurrentLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Middle +QualifierAlignment: Left +#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict'] +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' +ReferenceAlignment: Middle +ReflowComments: false # IndentOnly +SeparateDefinitionBlocks: Always +SortIncludes: CaseInsensitive +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +Standard: c++17 +TabWidth: 4 +UseTab: Never +WhitespaceSensitiveMacros: ['STRINGIZE'] +... + diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index d5acd35e2..05bff1bdf 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -26,7 +26,7 @@ COPY . . RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) && \ cp build/bin/* . diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile index 34ba856d3..575e81b48 100644 --- a/.devops/full-musa.Dockerfile +++ b/.devops/full-musa.Dockerfile @@ -19,7 +19,7 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ +RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) && \ cp build/bin/* . diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index 45c0585b0..02dce501c 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH RUN echo "Building with static libs" && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \ - cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \ cmake --build build --config Release --target llama-cli # TODO: use image with NNRT diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile index 3279c8da4..7796891d5 100644 --- a/.devops/llama-cli-cuda.Dockerfile +++ b/.devops/llama-cli-cuda.Dockerfile @@ -22,7 +22,7 @@ COPY . . RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release --target llama-cli -j$(nproc) && \ mkdir -p /app/lib && \ find build -name "*.so" -exec cp {} /app/lib \; diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile index 1f4ce0730..0706f732a 100644 --- a/.devops/llama-cli-intel.Dockerfile +++ b/.devops/llama-cli-intel.Dockerfile @@ -15,7 +15,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ echo "Building with static libs" && \ - cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \ cmake --build build --config Release --target llama-cli diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile index 1edf75cf2..3372749be 100644 --- a/.devops/llama-cli-musa.Dockerfile +++ b/.devops/llama-cli-musa.Dockerfile @@ -15,7 +15,7 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ +RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release --target llama-cli -j$(nproc) && \ mkdir -p /app/lib && \ find build -name "*.so" -exec cp {} /app/lib \; diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile index 9b0dad8bf..92a6e0479 100644 --- a/.devops/llama-cli-vulkan.Dockerfile +++ b/.devops/llama-cli-vulkan.Dockerfile @@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key # Build it WORKDIR /app COPY . . -RUN cmake -B build -DGGML_VULKAN=1 && \ +RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \ cmake --build build --config Release --target llama-cli # Clean up diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index ea07a4d52..bf8a198f9 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -22,7 +22,7 @@ COPY . . RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release --target llama-server -j$(nproc) && \ mkdir -p /app/lib && \ find build -name "*.so" -exec cp {} /app/lib \; diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index 773f030a7..b503b8cfe 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -15,7 +15,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ echo "Building with dynamic libs" && \ - cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ cmake --build build --config Release --target llama-server FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile index 259877468..eb67201c1 100644 --- a/.devops/llama-server-musa.Dockerfile +++ b/.devops/llama-server-musa.Dockerfile @@ -15,7 +15,7 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ +RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release --target llama-server -j$(nproc) && \ mkdir -p /app/lib && \ find build -name "*.so" -exec cp {} /app/lib \; diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile index 93c5e0c26..6aa786779 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key # Build it WORKDIR /app COPY . . -RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ +RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ cmake --build build --config Release --target llama-server # Clean up diff --git a/.gitignore b/.gitignore index 1092d097a..307c065f7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.a *.bat *.bin +*.d *.dll *.dot *.etag @@ -133,3 +134,7 @@ poetry.toml # Test models for lora adapters /lora-tests + +# Local scripts +/run-vim.sh +/run-chat.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 93c60ef43..994e61e45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,13 @@ if (WIN32) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) endif() +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + add_compile_options("$<$:/source-charset:utf-8>") + add_compile_options("$<$:/source-charset:utf-8>") + add_compile_options("$<$:/execution-charset:utf-8>") + add_compile_options("$<$:/execution-charset:utf-8>") +endif() + # # option list # diff --git a/Makefile b/Makefile index 804c1bbec..5c8994385 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,6 @@ TEST_TARGETS = \ tests/test-backend-ops \ tests/test-chat-template \ tests/test-double-float \ - tests/test-grad0 \ tests/test-grammar-integration \ tests/test-grammar-parser \ tests/test-json-schema-to-grammar \ @@ -527,11 +526,11 @@ ifndef GGML_NO_ACCELERATE # Mac OS - include Accelerate framework. # `-framework Accelerate` works both with Apple Silicon and Mac Intel ifeq ($(UNAME_S),Darwin) - MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE - MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK - MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 - MK_LDFLAGS += -framework Accelerate - OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o + MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE + MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK + MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 + MK_LDFLAGS += -framework Accelerate + OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o endif endif # GGML_NO_ACCELERATE @@ -542,44 +541,44 @@ ifndef GGML_NO_OPENMP endif # GGML_NO_OPENMP ifdef GGML_OPENBLAS - MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) - MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) - MK_LDFLAGS += $(shell pkg-config --libs openblas) - OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) + MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) + MK_LDFLAGS += $(shell pkg-config --libs openblas) + OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o endif # GGML_OPENBLAS ifdef GGML_OPENBLAS64 - MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) - MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) - MK_LDFLAGS += $(shell pkg-config --libs openblas64) - OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) + MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) + MK_LDFLAGS += $(shell pkg-config --libs openblas64) + OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o endif # GGML_OPENBLAS64 ifdef GGML_BLIS - MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis - MK_LDFLAGS += -lblis -L/usr/local/lib - OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o + MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis + MK_LDFLAGS += -lblis -L/usr/local/lib + OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o endif # GGML_BLIS ifdef GGML_NVPL - MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas - MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp - OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o + MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas + MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp + OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o endif # GGML_NVPL ifndef GGML_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJ_GGML += ggml/src/ggml-cpu/llamafile/sgemm.o + MK_CPPFLAGS += -DGGML_USE_LLAMAFILE + OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o endif ifndef GGML_NO_AMX MK_CPPFLAGS += -DGGML_USE_AMX - OBJ_GGML += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o + OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o endif ifdef GGML_RPC - MK_CPPFLAGS += -DGGML_USE_RPC - OBJ_GGML += ggml/src/ggml-rpc.o + MK_CPPFLAGS += -DGGML_USE_RPC + OBJ_GGML_EXT += ggml/src/ggml-rpc.o endif # GGML_RPC OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu)) @@ -604,9 +603,9 @@ ifdef GGML_CUDA MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib MK_NVCCFLAGS += -use_fast_math - OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o - OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) - OBJ_GGML += $(OBJ_CUDA_TMPL) + OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o + OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) + OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) ifdef LLAMA_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings @@ -636,10 +635,6 @@ else ifndef CUDA_POWER_ARCH MK_NVCCFLAGS += -arch=native endif # CUDA_DOCKER_ARCH -ifdef GGML_CUDA_FORCE_DMMV - MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV -endif # GGML_CUDA_FORCE_DMMV - ifdef GGML_CUDA_FORCE_MMQ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ endif # GGML_CUDA_FORCE_MMQ @@ -648,20 +643,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS endif # GGML_CUDA_FORCE_CUBLAS -ifdef GGML_CUDA_DMMV_X - MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) -else - MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32 -endif # GGML_CUDA_DMMV_X - -ifdef GGML_CUDA_MMV_Y - MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) -else ifdef GGML_CUDA_DMMV_Y - MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility -else - MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1 -endif # GGML_CUDA_MMV_Y - ifdef GGML_CUDA_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16 endif # GGML_CUDA_F16 @@ -670,12 +651,6 @@ ifdef GGML_CUDA_DMMV_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16 endif # GGML_CUDA_DMMV_F16 -ifdef GGML_CUDA_KQUANTS_ITER - MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) -else - MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 -endif - ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) else @@ -723,9 +698,9 @@ ggml/src/ggml-cuda/ggml-cuda.o: \ endif # GGML_CUDA ifdef GGML_VULKAN - MK_CPPFLAGS += -DGGML_USE_VULKAN - MK_LDFLAGS += $(shell pkg-config --libs vulkan) - OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o + MK_CPPFLAGS += -DGGML_USE_VULKAN + MK_LDFLAGS += $(shell pkg-config --libs vulkan) + OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o ifdef GGML_VULKAN_CHECK_RESULTS MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS @@ -755,10 +730,10 @@ GLSLC_CMD = glslc _ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen _ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp _ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp -_ggml_vk_input_dir = ggml/src/vulkan-shaders +_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp) -ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) +ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@ $(_ggml_vk_header): $(_ggml_vk_source) @@ -770,8 +745,8 @@ $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen --target-hpp $(_ggml_vk_header) \ --target-cpp $(_ggml_vk_source) -vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp - $(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp + $(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp endif # GGML_VULKAN @@ -784,10 +759,6 @@ ifdef GGML_HIPBLAS AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) endif - GGML_CUDA_DMMV_X ?= 32 - GGML_CUDA_MMV_Y ?= 1 - GGML_CUDA_KQUANTS_ITER ?= 2 - MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA ifdef GGML_HIP_UMA @@ -801,13 +772,6 @@ endif # GGML_HIP_UMA HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS)) - HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) - HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) - HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) - -ifdef GGML_CUDA_FORCE_DMMV - HIPFLAGS += -DGGML_CUDA_FORCE_DMMV -endif # GGML_CUDA_FORCE_DMMV ifdef GGML_CUDA_FORCE_MMQ HIPFLAGS += -DGGML_CUDA_FORCE_MMQ @@ -821,9 +785,9 @@ ifdef GGML_CUDA_NO_PEER_COPY HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY - OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o - OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) - OBJ_GGML += $(OBJ_CUDA_TMPL) + OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o + OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) + OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) ggml/src/ggml-cuda/ggml-cuda.o: \ ggml/src/ggml-cuda/ggml-cuda.cu \ @@ -870,10 +834,6 @@ ifdef GGML_MUSA MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS)) -ifdef GGML_CUDA_FORCE_DMMV - MUSAFLAGS += -DGGML_CUDA_FORCE_DMMV -endif # GGML_CUDA_FORCE_DMMV - ifdef GGML_CUDA_FORCE_MMQ MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ endif # GGML_CUDA_FORCE_MMQ @@ -882,18 +842,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS endif # GGML_CUDA_FORCE_CUBLAS -ifdef GGML_CUDA_DMMV_X - MUSAFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) -else - MUSAFLAGS += -DGGML_CUDA_DMMV_X=32 -endif # GGML_CUDA_DMMV_X - -ifdef GGML_CUDA_MMV_Y - MUSAFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) -else - MUSAFLAGS += -DGGML_CUDA_MMV_Y=1 -endif # GGML_CUDA_MMV_Y - ifdef GGML_CUDA_F16 MUSAFLAGS += -DGGML_CUDA_F16 endif # GGML_CUDA_F16 @@ -902,12 +850,6 @@ ifdef GGML_CUDA_DMMV_F16 MUSAFLAGS += -DGGML_CUDA_F16 endif # GGML_CUDA_DMMV_F16 -ifdef GGML_CUDA_KQUANTS_ITER - MUSAFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) -else - MUSAFLAGS += -DK_QUANTS_PER_ITERATION=2 -endif - ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) else @@ -922,9 +864,9 @@ ifdef GGML_CUDA_FA_ALL_QUANTS MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS endif # GGML_CUDA_FA_ALL_QUANTS - OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o - OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) - OBJ_GGML += $(OBJ_CUDA_TMPL) + OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o + OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) + OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) ggml/src/ggml-cuda/ggml-cuda.o: \ ggml/src/ggml-cuda/ggml-cuda.cu \ @@ -945,9 +887,9 @@ ggml/src/ggml-cuda/%.o: \ endif # GGML_MUSA ifdef GGML_METAL - MK_CPPFLAGS += -DGGML_USE_METAL - MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit - OBJ_GGML += ggml/src/ggml-metal/ggml-metal.o + MK_CPPFLAGS += -DGGML_USE_METAL + MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit + OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o ifdef GGML_METAL_USE_BF16 MK_CPPFLAGS += -DGGML_METAL_USE_BF16 @@ -956,14 +898,15 @@ ifdef GGML_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif ifdef GGML_METAL_EMBED_LIBRARY - MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY - OBJ_GGML += ggml/src/ggml-metal-embed.o + MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY + OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o endif endif # GGML_METAL ifdef GGML_METAL ggml/src/ggml-metal/ggml-metal.o: \ ggml/src/ggml-metal/ggml-metal.m \ + ggml/src/ggml-metal/ggml-metal-impl.h \ ggml/include/ggml-metal.h \ ggml/include/ggml.h $(CC) $(CFLAGS) -c $< -o $@ @@ -971,9 +914,11 @@ ggml/src/ggml-metal/ggml-metal.o: \ ifdef GGML_METAL_EMBED_LIBRARY ggml/src/ggml-metal-embed.o: \ ggml/src/ggml-metal/ggml-metal.metal \ + ggml/src/ggml-metal/ggml-metal-impl.h \ ggml/src/ggml-common.h @echo "Embedding Metal library" - @sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal + @sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp + @sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal $(eval TEMP_ASSEMBLY=$(shell mktemp -d)) @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @@ -987,36 +932,42 @@ ggml/src/ggml-metal-embed.o: \ endif endif # GGML_METAL -OBJ_GGML += \ - ggml/src/ggml.o \ - ggml/src/ggml-aarch64.o \ - ggml/src/ggml-alloc.o \ - ggml/src/ggml-backend.o \ - ggml/src/ggml-backend-reg.o \ - ggml/src/ggml-quants.o \ - ggml/src/ggml-threading.o \ - ggml/src/ggml-cpu/ggml-cpu.o \ - ggml/src/ggml-cpu/ggml-cpu-cpp.o \ - ggml/src/ggml-cpu/ggml-cpu-aarch64.o \ - ggml/src/ggml-cpu/ggml-cpu-quants.o +DIR_GGML = ggml +DIR_LLAMA = src +DIR_COMMON = common + +OBJ_GGML = \ + $(DIR_GGML)/src/ggml.o \ + $(DIR_GGML)/src/ggml-aarch64.o \ + $(DIR_GGML)/src/ggml-alloc.o \ + $(DIR_GGML)/src/ggml-backend.o \ + $(DIR_GGML)/src/ggml-backend-reg.o \ + $(DIR_GGML)/src/ggml-opt.o \ + $(DIR_GGML)/src/ggml-quants.o \ + $(DIR_GGML)/src/ggml-threading.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \ + $(OBJ_GGML_EXT) OBJ_LLAMA = \ - src/llama.o \ - src/llama-vocab.o \ - src/llama-grammar.o \ - src/llama-sampling.o \ - src/unicode.o \ - src/unicode-data.o + $(DIR_LLAMA)/llama.o \ + $(DIR_LLAMA)/llama-vocab.o \ + $(DIR_LLAMA)/llama-grammar.o \ + $(DIR_LLAMA)/llama-sampling.o \ + $(DIR_LLAMA)/unicode.o \ + $(DIR_LLAMA)/unicode-data.o OBJ_COMMON = \ - common/common.o \ - common/arg.o \ - common/log.o \ - common/console.o \ - common/ngram-cache.o \ - common/sampling.o \ - common/build-info.o \ - common/json-schema-to-grammar.o + $(DIR_COMMON)/common.o \ + $(DIR_COMMON)/arg.o \ + $(DIR_COMMON)/log.o \ + $(DIR_COMMON)/console.o \ + $(DIR_COMMON)/ngram-cache.o \ + $(DIR_COMMON)/sampling.o \ + $(DIR_COMMON)/build-info.o \ + $(DIR_COMMON)/json-schema-to-grammar.o OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) @@ -1117,246 +1068,78 @@ endif # Build libraries # -# ggml +# Libraries +LIB_GGML = libggml.so +LIB_GGML_S = libggml.a -ggml/src/ggml.o: \ - ggml/src/ggml.c \ - ggml/include/ggml.h - $(CC) $(CFLAGS) -c $< -o $@ +LIB_LLAMA = libllama.so +LIB_LLAMA_S = libllama.a -ggml/src/ggml-threading.o: \ - ggml/src/ggml-threading.cpp \ - ggml/include/ggml.h - $(CXX) $(XXCFLAGS) -c $< -o $@ +LIB_COMMON = libcommon.so +LIB_COMMON_S = libcommon.a -ggml/src/ggml-cpu/ggml-cpu.o: \ - ggml/src/ggml-cpu/ggml-cpu.c \ - ggml/include/ggml.h \ - ggml/src/ggml-common.h - $(CC) $(CFLAGS) -c $< -o $@ +# Targets +BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S) -ggml/src/ggml-cpu/ggml-cpu-cpp.o: \ +# Dependency files +DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d) + +# Default target +all: $(BUILD_TARGETS) + +# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files +# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml +$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \ ggml/src/ggml-cpu/ggml-cpu.cpp \ - ggml/include/ggml.h \ - ggml/src/ggml-common.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ggml/src/ggml-alloc.o: \ - ggml/src/ggml-alloc.c \ - ggml/include/ggml.h \ - ggml/include/ggml-alloc.h - $(CC) $(CFLAGS) -c $< -o $@ - -ggml/src/ggml-backend.o: \ - ggml/src/ggml-backend.cpp \ - ggml/src/ggml-backend-impl.h \ - ggml/include/ggml.h \ - ggml/include/ggml-backend.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ggml/src/ggml-quants.o: \ - ggml/src/ggml-quants.c \ - ggml/include/ggml.h \ - ggml/src/ggml-quants.h \ - ggml/src/ggml-common.h - $(CC) $(CFLAGS) -c $< -o $@ - -ggml/src/ggml-aarch64.o: \ - ggml/src/ggml-aarch64.c \ - ggml/include/ggml.h \ - ggml/src/ggml-aarch64.h \ - ggml/src/ggml-common.h - $(CC) $(CFLAGS) -c $< -o $@ - -ggml/src/ggml-blas/ggml-blas.o: \ - ggml/src/ggml-blas/ggml-blas.cpp \ - ggml/include/ggml-blas.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ifndef GGML_NO_LLAMAFILE -ggml/src/ggml-cpu/llamafile/sgemm.o: \ - ggml/src/ggml-cpu/llamafile/sgemm.cpp \ - ggml/src/ggml-cpu/llamafile/sgemm.h \ - ggml/include/ggml.h - $(CXX) $(CXXFLAGS) -c $< -o $@ -I ggml/src -I ggml/src/ggml-cpu -endif # GGML_NO_LLAMAFILE - -ifndef GGML_NO_AMX -ggml/src/ggml-amx/ggml-amx.o: \ - ggml/src/ggml-amx/ggml-amx.cpp \ - ggml/include/ggml-amx.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -ggml/src/ggml-amx/mmq.o: \ - ggml/src/ggml-amx/mmq.cpp \ - ggml/src/ggml-amx/mmq.h \ - ggml/include/ggml.h - $(CXX) $(CXXFLAGS) -c $< -o $@ -endif - -ifdef GGML_RPC -ggml/src/ggml-rpc.o: \ - ggml/src/ggml-rpc.cpp \ - ggml/include/ggml-rpc.h - $(CXX) $(CXXFLAGS) -c $< -o $@ -endif # GGML_RPC - -$(LIB_GGML): \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) - -$(LIB_GGML_S): \ - $(OBJ_GGML) - ar rcs $(LIB_GGML_S) $^ - -# llama - -src/unicode.o: \ - src/unicode.cpp \ - src/unicode.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -src/unicode-data.o: \ - src/unicode-data.cpp \ - src/unicode-data.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -src/llama.o: \ - src/llama.cpp \ - src/llama-impl.h \ - src/llama-vocab.h \ - src/llama-grammar.h \ - src/llama-sampling.h \ - src/unicode.h \ - include/llama.h \ - ggml/include/ggml-cuda.h \ - ggml/include/ggml-metal.h \ + ggml/include/ggml-backend.h \ ggml/include/ggml.h \ ggml/include/ggml-alloc.h \ - ggml/include/ggml-backend.h - $(CXX) $(CXXFLAGS) -c $< -o $@ + ggml/src/ggml-backend-impl.h \ + ggml/include/ggml-cpu.h \ + ggml/src/ggml-impl.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -src/llama-vocab.o: \ - src/llama-vocab.cpp \ - src/llama-vocab.h \ - src/llama-impl.h \ - include/llama.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +# Rules for building object files +$(DIR_GGML)/%.o: $(DIR_GGML)/%.c + $(CC) $(CFLAGS) -MMD -c $< -o $@ -src/llama-grammar.o: \ - src/llama-grammar.cpp \ - src/llama-grammar.h \ - src/llama-impl.h \ - src/llama-vocab.h \ - src/llama-sampling.h \ - include/llama.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +$(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp + $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ -src/llama-sampling.o: \ - src/llama-sampling.cpp \ - src/llama-sampling.h \ - src/llama-impl.h \ - include/llama.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +$(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp + $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ -$(LIB_LLAMA): \ - $(OBJ_LLAMA) \ - $(LIB_GGML) +$(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp + $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ + +# Rules for building libraries +$(LIB_GGML): $(OBJ_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -$(LIB_LLAMA_S): \ - $(OBJ_LLAMA) +$(LIB_GGML_S): $(OBJ_GGML) + ar rcs $(LIB_GGML_S) $^ + +$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML) + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) + +$(LIB_LLAMA_S): $(OBJ_LLAMA) ar rcs $(LIB_LLAMA_S) $^ -# common - -common/common.o: \ - common/common.cpp \ - common/common.h \ - common/console.h \ - common/sampling.h \ - common/json.hpp \ - common/json-schema-to-grammar.h \ - include/llama.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -common/arg.o: \ - common/arg.cpp \ - common/arg.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -common/log.o: \ - common/log.cpp \ - common/log.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -common/sampling.o: \ - common/sampling.cpp \ - common/sampling.h \ - include/llama.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -common/console.o: \ - common/console.cpp \ - common/console.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -common/json-schema-to-grammar.o: \ - common/json-schema-to-grammar.cpp \ - common/json-schema-to-grammar.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -common/ngram-cache.o: \ - common/ngram-cache.cpp \ - common/ngram-cache.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - -$(LIB_COMMON): \ - $(OBJ_COMMON) \ - $(LIB_LLAMA) \ - $(LIB_GGML) +$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -$(LIB_COMMON_S): \ - $(OBJ_COMMON) +$(LIB_COMMON_S): $(OBJ_COMMON) ar rcs $(LIB_COMMON_S) $^ +# Include dependency files +-include $(DEP_FILES) + +# Clean rule clean: - rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS) - rm -rvf src/*.o - rm -rvf tests/*.o - rm -rvf examples/*.o - rm -rvf common/*.o - rm -rvf *.a - rm -rvf *.dll - rm -rvf *.so - rm -rvf *.dot - rm -rvf ggml/*.a - rm -rvf ggml/*.dll - rm -rvf ggml/*.so - rm -rvf ggml/src/*.o - rm -rvf common/build-info.cpp - rm -rvf ggml/src/ggml-cpu/*.o - rm -rvf ggml/src/ggml-cpu/llamafile/*.o - rm -vrf ggml/src/ggml-amx/*.o - rm -vrf ggml/src/ggml-blas/*.o - rm -vrf ggml/src/ggml-cann/*.o - rm -vrf ggml/src/ggml-cpu/*.o - rm -vrf ggml/src/ggml-cuda/*.o - rm -vrf ggml/src/ggml-cuda/template-instances/*.o - rm -vrf ggml/src/ggml-hip/*.o - rm -vrf ggml/src/ggml-kompute/*.o - rm -vrf ggml/src/ggml-metal/*.o - rm -vrf ggml/src/ggml-metal/ggml-metal-embed.metal - rm -vrf ggml/src/ggml-rpc/*.o - rm -vrf ggml/src/ggml-sycl/*.o - rm -vrf ggml/src/ggml-vulkan/*.o - rm -vrf ggml/src/ggml-musa/*.o - rm -rvf $(BUILD_TARGETS) - rm -rvf $(TEST_TARGETS) - rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp - rm -rvf $(LEGACY_TARGETS_CLEAN) - find examples pocs -type f -name "*.o" -delete + rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS) + rm -rvf *.a *.dll *.so *.dot + find ggml src common tests examples pocs -type f -name "*.o" -delete + find ggml src common tests examples pocs -type f -name "*.d" -delete # # Examples @@ -1662,11 +1445,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \ $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grad0: tests/test-grad0.cpp \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - tests/test-opt: tests/test-opt.cpp \ $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/README.md b/README.md index 6ab6acf12..5f7933c13 100644 --- a/README.md +++ b/README.md @@ -459,14 +459,14 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532) -## Other documentations +## Other documentation - [main (cli)](./examples/main/README.md) - [server](./examples/server/README.md) - [jeopardy](./examples/jeopardy/README.md) - [GBNF grammars](./grammars/README.md) -**Development documentations** +**Development documentation** - [How to build](./docs/build.md) - [Running on Docker](./docs/docker.md) diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index ef68417b4..5c55bc6b8 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -3,18 +3,60 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) -set(GGML_BLAS @GGML_BLAS@) -set(GGML_CUDA @GGML_CUDA@) -set(GGML_METAL @GGML_METAL@) -set(GGML_HIP @GGML_HIP@) +set(GGML_STATIC @GGML_STATIC@) +set(GGML_NATIVE @GGML_NATIVE@) +set(GGML_LTO @GGML_LTO@) +set(GGML_CCACHE @GGML_CCACHE@) +set(GGML_AVX @GGML_AVX@) +set(GGML_AVX2 @GGML_AVX2@) +set(GGML_AVX512 @GGML_AVX512@) +set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@) +set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@) +set(GGML_AVX512_BF16 @GGML_AVX512_BF16@) +set(GGML_AMX_TILE @GGML_AMX_TILE@) +set(GGML_AMX_INT8 @GGML_AMX_INT8@) +set(GGML_AMX_BF16 @GGML_AMX_BF16@) +set(GGML_FMA @GGML_FMA@) +set(GGML_LASX @GGML_LASX@) +set(GGML_LSX @GGML_LSX@) +set(GGML_RVV @GGML_RVV@) +set(GGML_SVE @GGML_SVE@) + set(GGML_ACCELERATE @GGML_ACCELERATE@) -set(GGML_VULKAN @GGML_VULKAN@) +set(GGML_OPENMP @GGML_OPENMP@) +set(GGML_CPU_HBM @GGML_CPU_HBM@) +set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@) + +set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@) +set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@) +set(GGML_CUDA_F16 @GGML_CUDA_F16@) +set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@) +set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@) +set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@) +set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@) +set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@) + +set(GGML_HIP_UMA @GGML_HIP_UMA@) + set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) -set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) -set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) -set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) -set(GGML_SYCL @GGML_SYCL@) -set(GGML_OPENMP @GGML_OPENMP@) +set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) +set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) +set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@) +set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@) +set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) +set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@) + +set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@) +set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@) +set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@) +set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@) +set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@) +set(GGML_METAL_STD @GGML_METAL_STD@) + +set(GGML_SYCL_F16 @GGML_SYCL_F16@) +set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@) +set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@) + @PACKAGE_INIT@ @@ -22,65 +64,111 @@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") -# Ensure transient dependencies satisfied - find_package(Threads REQUIRED) -if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) +set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") +set(_llama_link_deps "") +set(_llama_link_opts "") +foreach(_ggml_lib ggml ggml-base) + string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY") + find_library(${_ggml_lib_var} ${_ggml_lib} + REQUIRED + HINTS ${LLAMA_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH + ) + list(APPEND _llama_link_deps "${${_ggml_lib_var}}") + message(STATUS "Found ${${_ggml_lib_var}}") +endforeach() + +foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan) + string(TOUPPER "GGML_${backend}" backend_id) + set(_ggml_lib "ggml-${backend}") + string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY") + + find_library(${_ggml_lib_var} ${_ggml_lib} + HINTS ${LLAMA_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH + ) + if(${_ggml_lib_var}) + list(APPEND _llama_link_deps "${${_ggml_lib_var}}") + set(${backend_id} ON) + message(STATUS "Found backend ${${_ggml_lib_var}}") + else() + set(${backend_id} OFF) + endif() +endforeach() + +if (NOT LLAMA_SHARED_LIB) + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) + list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK}) + endif() + + if (GGML_OPENMP) + find_package(OpenMP REQUIRED) + list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + endif() + + if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) + list(APPEND _llama_link_deps memkind) + endif() + + if (GGML_BLAS) + find_package(BLAS REQUIRED) + list(APPEND _llama_link_deps ${BLAS_LIBRARIES}) + list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS}) + endif() + + if (GGML_CUDA) + find_package(CUDAToolkit REQUIRED) + endif() + + if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK}) + endif() + + if (GGML_VULKAN) + find_package(Vulkan REQUIRED) + list(APPEND _llama_link_deps Vulkan::Vulkan) + endif() + + if (GGML_HIP) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) + list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas) + endif() + + if (GGML_SYCL) + find_package(DNNL) + if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") + list(APPEND _llama_link_deps DNNL::dnnl) + endif() + if (WIN32) + find_package(IntelSYCL REQUIRED) + find_package(MKL REQUIRED) + list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + endif() + endif() endif() -if (GGML_BLAS) - find_package(BLAS REQUIRED) -endif() - -if (GGML_CUDA) - find_package(CUDAToolkit REQUIRED) -endif() - -if (GGML_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) -endif() - -if (GGML_VULKAN) - find_package(Vulkan REQUIRED) -endif() - -if (GGML_HIPBLAS) - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) -endif() - -if (GGML_SYCL) - find_package(IntelSYCL REQUIRED) - find_package(MKL REQUIRED) -endif() - -if (GGML_OPENMP) - find_package(OpenMP REQUIRED) -endif() - - -find_library(ggml_LIBRARY ggml - REQUIRED - HINTS ${LLAMA_LIB_DIR}) - find_library(llama_LIBRARY llama REQUIRED - HINTS ${LLAMA_LIB_DIR}) - -set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@") -set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") + HINTS ${LLAMA_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH +) add_library(llama UNKNOWN IMPORTED) - set_target_properties(llama PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" + INTERFACE_LINK_OPTIONS "${_llama_link_opts}" INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" IMPORTED_LOCATION "${llama_LIBRARY}" diff --git a/common/arg.cpp b/common/arg.cpp index 7c5c5e5cd..4115b2f75 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1939,17 +1939,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(common_arg( - {"-ld", "--logdir"}, "LOGDIR", - "path under which to save YAML logs (no logging if unset)", - [](common_params & params, const std::string & value) { - params.logdir = value; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; - } - } - )); add_opt(common_arg( {"--positive-file"}, "FNAME", string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), diff --git a/common/common.cpp b/common/common.cpp index ebd16b600..d314523db 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -875,6 +875,12 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } + if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) { + LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__); + llama_free_model(model); + return iparams; + } + if (!params.control_vectors.empty()) { if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); @@ -1890,213 +1896,3 @@ common_control_vector_data common_control_vector_load(const std::vector & data) { - if (data.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - fprintf(stream, "%s: [", prop_name); - for (size_t i = 0; i < data.size() - 1; ++i) { - fprintf(stream, "%e, ", data[i]); - } - fprintf(stream, "%e]\n", data.back()); -} - -void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector & data) { - if (data.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - fprintf(stream, "%s: [", prop_name); - for (size_t i = 0; i < data.size() - 1; ++i) { - fprintf(stream, "%d, ", data[i]); - } - fprintf(stream, "%d]\n", data.back()); -} - -void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) { - std::string data_str(data == NULL ? "" : data); - - if (data_str.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - size_t pos_start = 0; - size_t pos_found = 0; - - if (std::isspace(data_str[0]) || std::isspace(data_str.back())) { - data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); - data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); - data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); - data_str = "\"" + data_str + "\""; - fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); - return; - } - - if (data_str.find('\n') == std::string::npos) { - fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); - return; - } - - fprintf(stream, "%s: |\n", prop_name); - while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) { - fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str()); - pos_start = pos_found + 1; - } -} - -void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx, - const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { - ggml_cpu_init(); // some ARM features are detected at runtime - - const auto & sparams = params.sparams; - - fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); - fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); - fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); - fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); - fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false"); - fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); - fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); - fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false"); - fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); - fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); - fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); - fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); - -#ifdef NDEBUG - fprintf(stream, "debug: false\n"); -#else - fprintf(stream, "debug: true\n"); -#endif // NDEBUG - - fprintf(stream, "model_desc: %s\n", model_desc); - fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); - -#ifdef __OPTIMIZE__ - fprintf(stream, "optimize: true\n"); -#else - fprintf(stream, "optimize: false\n"); -#endif // __OPTIMIZE__ - - fprintf(stream, "time: %s\n", timestamp.c_str()); - - fprintf(stream, "\n"); - fprintf(stream, "###############\n"); - fprintf(stream, "# User Inputs #\n"); - fprintf(stream, "###############\n"); - fprintf(stream, "\n"); - - fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); - fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); - fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); - fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); - fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); - fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length); - fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base); - fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier); - fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n); - fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); - fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); - fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); - yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str()); - fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); - fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); - fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false"); - - yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); - fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); - yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); - fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); - fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); - fprintf(stream, "keep: %d # default: 0\n", params.n_keep); - fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); - - fprintf(stream, "logit_bias:\n"); - for (const auto & logit_bias : sparams.logit_bias) { - fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias); - } - - fprintf(stream, "lora:\n"); - for (auto & la : params.lora_adapters) { - if (la.scale == 1.0f) { - fprintf(stream, " - %s\n", la.path.c_str()); - } - } - fprintf(stream, "lora_scaled:\n"); - for (auto & la : params.lora_adapters) { - if (la.scale != 1.0f) { - fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale); - } - } - fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false"); - fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); - fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); - fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); - fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); - fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); - fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); - fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); - fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); - fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); - fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); - fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); - fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); - fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); - fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); - fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); - fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); - fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); - yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str()); - fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); - fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); - fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); - yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens); - fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); - - fprintf(stream, "reverse_prompt:\n"); - for (std::string ap : params.antiprompt) { - size_t pos = 0; - while ((pos = ap.find('\n', pos)) != std::string::npos) { - ap.replace(pos, 1, "\\n"); - pos += 1; - } - - fprintf(stream, " - %s\n", ap.c_str()); - } - - fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); - fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); - fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); - fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); - fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); - fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); - - const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); - yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); - - fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency()); - fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); - fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); - fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); - fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability); - fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold); - fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); - fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); - fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); -} diff --git a/common/common.h b/common/common.h index 6289feaeb..7977cc7a9 100644 --- a/common/common.h +++ b/common/common.h @@ -209,7 +209,6 @@ struct common_params { std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT std::string input_prefix = ""; // string to prefix user inputs with // NOLINT std::string input_suffix = ""; // string to suffix user inputs with // NOLINT - std::string logdir = ""; // directory in which to save YAML log files // NOLINT std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT std::string logits_file = ""; // file for saving *all* logits // NOLINT @@ -584,15 +583,3 @@ common_control_vector_data common_control_vector_load(const std::vector & data); -void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector & data); -void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); - -void yaml_dump_non_result_info( - FILE * stream, const common_params & params, const llama_context * lctx, - const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 39afa5ef4..9f4b8154b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3040,6 +3040,11 @@ class OlmoModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("Olmo1124ForCausalLM") +class Olmo1124Model(Model): + model_arch = gguf.MODEL_ARCH.OLMO_1124 + + @Model.register("OlmoeForCausalLM") class OlmoeModel(Model): model_arch = gguf.MODEL_ARCH.OLMOE diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 38185f738..8d8312e91 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -34,9 +34,10 @@ The SYCL backend would be broken by some PRs due to no online CI. The following release is verified with good quality: -|Commit ID|Tag|Release|Verified Platform| -|-|-|-|-| -|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| +|Commit ID|Tag|Release|Verified Platform| Update date| +|-|-|-|-|-| +|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19| +|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1|| ## News @@ -312,12 +313,14 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR # Build LLAMA with Nvidia BLAS acceleration through SYCL +# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance +GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # Option 2: Use FP16 -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # build all binary cmake --build build --config Release -j -v @@ -335,8 +338,9 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE ## AMD # Use FP32, FP16 is not supported -# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:' -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:' +GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # build all binary cmake --build build --config Release -j -v @@ -646,6 +650,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |--------------------|---------------------------------------|---------------------------------------------| | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better perforemance than FP16 on quantized model| | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | +| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | diff --git a/docs/build.md b/docs/build.md index 52de2b4e2..359952b30 100644 --- a/docs/build.md +++ b/docs/build.md @@ -186,13 +186,9 @@ The following compilation options are also available to tweak performance: | Option | Legal values | Default | Description | |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | -| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | -| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. | | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models | | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | -| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | @@ -268,13 +264,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used. If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. -The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): - -| Option | Legal values | Default | Description | -|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | -| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | -| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | ### Vulkan @@ -282,9 +271,9 @@ The following compilation options are also available to tweak performance (yes, #### w64devkit -Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases). +Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases). -Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required. +Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings. Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies: ```sh @@ -302,6 +291,29 @@ EOF ``` Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`. +#### Git Bash MINGW64 + +Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings + +Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++` + +Download and install [`CMake`](https://cmake.org/download/) with the default settings + +Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings. + +Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands + +``` +cmake -B build -DGGML_VULKAN=ON +cmake --build build --config Release +``` + +Now you can load the model in conversation mode using `Vulkan` + +``` +build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv +``` + #### MSYS2 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies. ```sh diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index f18362c91..15b358dc4 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -43,50 +43,6 @@ static std::vector * g_output_tokens; static bool is_interacting = false; -static void write_logfile( - const llama_context * ctx, const common_params & params, const llama_model * model, - const std::vector & input_tokens, const std::string & output, - const std::vector & output_tokens -) { - if (params.logdir.empty()) { - return; - } - - const std::string timestamp = string_get_sortable_timestamp(); - - const bool success = fs_create_directory_with_parents(params.logdir); - if (!success) { - LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: infill\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Generation Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - yaml_dump_string_multiline(logfile, "output", output.c_str()); - yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - - llama_perf_dump_yaml(logfile, ctx); - fclose(logfile); -} - #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void sigint_handler(int signo) { if (signo == SIGINT) { @@ -96,7 +52,6 @@ static void sigint_handler(int signo) { console::cleanup(); LOG("\n"); common_perf_print(*g_ctx, *g_smpl); - write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); // make sure all logs are flushed LOG("Interrupted by user\n"); @@ -625,7 +580,6 @@ int main(int argc, char ** argv) { LOG("\n"); common_perf_print(ctx, smpl); - write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); llama_free(ctx); llama_free_model(model); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8f4e0e206..3dc84a75c 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -6,28 +6,28 @@ #include #include #include +#include #include #include -#include #include #include #include #include #include #include -#include #include +#include +#include "common.h" #include "ggml.h" #include "llama.h" -#include "common.h" #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include #endif // utils @@ -36,8 +36,7 @@ static uint64_t get_time_ns() { return std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); } -template -static std::string join(const std::vector & values, const std::string & delim) { +template static std::string join(const std::vector & values, const std::string & delim) { std::ostringstream str; for (size_t i = 0; i < values.size(); i++) { str << values[i]; @@ -48,38 +47,35 @@ static std::string join(const std::vector & values, const std::string & delim return str.str(); } -template -static std::vector transform_to_str(const std::vector & values, F f) { +template static std::vector transform_to_str(const std::vector & values, F f) { std::vector str_values; std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); return str_values; } -template -static T avg(const std::vector & v) { +template static T avg(const std::vector & v) { if (v.empty()) { return 0; } T sum = std::accumulate(v.begin(), v.end(), T(0)); - return sum / (T)v.size(); + return sum / (T) v.size(); } -template -static T stdev(const std::vector & v) { +template static T stdev(const std::vector & v) { if (v.size() <= 1) { return 0; } - T mean = avg(v); + T mean = avg(v); T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0)); - T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)); + T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1)); return stdev; } static std::string get_cpu_info() { std::vector cpu_list; for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - auto * dev = ggml_backend_dev_get(i); - auto dev_type = ggml_backend_dev_type(dev); + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) { cpu_list.push_back(ggml_backend_dev_description(dev)); } @@ -90,8 +86,8 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::vector gpu_list; for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - auto * dev = ggml_backend_dev_get(i); - auto dev_type = ggml_backend_dev_type(dev); + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) { gpu_list.push_back(ggml_backend_dev_description(dev)); } @@ -100,17 +96,24 @@ static std::string get_gpu_info() { } // command line params -enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL}; +enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL }; static const char * output_format_str(output_formats format) { switch (format) { - case NONE: return "none"; - case CSV: return "csv"; - case JSON: return "json"; - case JSONL: return "jsonl"; - case MARKDOWN: return "md"; - case SQL: return "sql"; - default: GGML_ABORT("invalid output format"); + case NONE: + return "none"; + case CSV: + return "csv"; + case JSON: + return "json"; + case JSONL: + return "jsonl"; + case MARKDOWN: + return "md"; + case SQL: + return "sql"; + default: + GGML_ABORT("invalid output format"); } } @@ -135,10 +138,14 @@ static bool output_format_from_str(const std::string & s, output_formats & forma static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_MODE_NONE: return "none"; - case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; - default: GGML_ABORT("invalid split mode"); + case LLAMA_SPLIT_MODE_NONE: + return "none"; + case LLAMA_SPLIT_MODE_LAYER: + return "layer"; + case LLAMA_SPLIT_MODE_ROW: + return "row"; + default: + GGML_ABORT("invalid split mode"); } } @@ -149,59 +156,59 @@ static std::string pair_str(const std::pair & p) { } struct cmd_params { - std::vector model; - std::vector n_prompt; - std::vector n_gen; + std::vector model; + std::vector n_prompt; + std::vector n_gen; std::vector> n_pg; - std::vector n_batch; - std::vector n_ubatch; - std::vector type_k; - std::vector type_v; - std::vector n_threads; - std::vector cpu_mask; - std::vector cpu_strict; - std::vector poll; - std::vector n_gpu_layers; - std::vector rpc_servers; - std::vector split_mode; - std::vector main_gpu; - std::vector no_kv_offload; - std::vector flash_attn; - std::vector> tensor_split; - std::vector use_mmap; - std::vector embeddings; - ggml_numa_strategy numa; - int reps; - ggml_sched_priority prio; - int delay; - bool verbose; - bool progress; - output_formats output_format; - output_formats output_format_stderr; + std::vector n_batch; + std::vector n_ubatch; + std::vector type_k; + std::vector type_v; + std::vector n_threads; + std::vector cpu_mask; + std::vector cpu_strict; + std::vector poll; + std::vector n_gpu_layers; + std::vector rpc_servers; + std::vector split_mode; + std::vector main_gpu; + std::vector no_kv_offload; + std::vector flash_attn; + std::vector> tensor_split; + std::vector use_mmap; + std::vector embeddings; + ggml_numa_strategy numa; + int reps; + ggml_sched_priority prio; + int delay; + bool verbose; + bool progress; + output_formats output_format; + output_formats output_format_stderr; }; static const cmd_params cmd_params_defaults = { - /* model */ {"models/7B/ggml-model-q4_0.gguf"}, - /* n_prompt */ {512}, - /* n_gen */ {128}, + /* model */ { "models/7B/ggml-model-q4_0.gguf" }, + /* n_prompt */ { 512 }, + /* n_gen */ { 128 }, /* n_pg */ {}, - /* n_batch */ {2048}, - /* n_ubatch */ {512}, - /* type_k */ {GGML_TYPE_F16}, - /* type_v */ {GGML_TYPE_F16}, - /* n_threads */ {cpu_get_num_math()}, - /* cpu_mask */ {"0x0"}, - /* cpu_strict */ {false}, - /* poll */ {50}, - /* n_gpu_layers */ {99}, - /* rpc_servers */ {""}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, - /* main_gpu */ {0}, - /* no_kv_offload */ {false}, - /* flash_attn */ {false}, - /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, - /* use_mmap */ {true}, - /* embeddings */ {false}, + /* n_batch */ { 2048 }, + /* n_ubatch */ { 512 }, + /* type_k */ { GGML_TYPE_F16 }, + /* type_v */ { GGML_TYPE_F16 }, + /* n_threads */ { cpu_get_num_math() }, + /* cpu_mask */ { "0x0" }, + /* cpu_strict */ { false }, + /* poll */ { 50 }, + /* n_gpu_layers */ { 99 }, + /* rpc_servers */ { "" }, + /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, + /* main_gpu */ { 0 }, + /* no_kv_offload */ { false }, + /* flash_attn */ { false }, + /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, + /* use_mmap */ { true }, + /* embeddings */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -218,38 +225,59 @@ static void print_usage(int /* argc */, char ** argv) { printf("options:\n"); printf(" -h, --help\n"); printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", + join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" -ub, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); - printf(" -ctk, --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); - printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -C, --cpu-mask (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str()); - printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); + printf(" -pg (default: %s)\n", + join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", + join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ub, --ubatch-size (default: %s)\n", + join(cmd_params_defaults.n_ubatch, ",").c_str()); + printf(" -ctk, --cache-type-k (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv, --cache-type-v (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", + join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -C, --cpu-mask (default: %s)\n", + join(cmd_params_defaults.cpu_mask, ",").c_str()); + printf(" --cpu-strict <0|1> (default: %s)\n", + join(cmd_params_defaults.cpu_strict, ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", + join(cmd_params_defaults.n_gpu_layers, ",").c_str()); if (llama_supports_rpc()) { - printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); + printf(" -rpc, --rpc (default: %s)\n", + join(cmd_params_defaults.rpc_servers, ",").c_str()); } - printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); - printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", + join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", + join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", + join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -fa, --flash-attn <0|1> (default: %s)\n", + join(cmd_params_defaults.flash_attn, ",").c_str()); + printf(" -mmp, --mmap <0|1> (default: %s)\n", + join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" --numa (default: disabled)\n"); - printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); + printf(" -embd, --embeddings <0|1> (default: %s)\n", + join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); - printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); - printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); + printf(" -o, --output (default: %s)\n", + output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err (default: %s)\n", + output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0"); printf("\n"); - printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); + printf( + "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter " + "multiple times.\n"); } static ggml_type ggml_type_from_name(const std::string & s) { @@ -281,22 +309,21 @@ static ggml_type ggml_type_from_name(const std::string & s) { return GGML_TYPE_COUNT; } - static cmd_params parse_cmd_params(int argc, char ** argv) { - cmd_params params; - std::string arg; - bool invalid_param = false; - const std::string arg_prefix = "--"; - const char split_delim = ','; + cmd_params params; + std::string arg; + bool invalid_param = false; + const std::string arg_prefix = "--"; + const char split_delim = ','; - params.verbose = cmd_params_defaults.verbose; - params.output_format = cmd_params_defaults.output_format; + params.verbose = cmd_params_defaults.verbose; + params.output_format = cmd_params_defaults.output_format; params.output_format_stderr = cmd_params_defaults.output_format_stderr; - params.reps = cmd_params_defaults.reps; - params.numa = cmd_params_defaults.numa; - params.prio = cmd_params_defaults.prio; - params.delay = cmd_params_defaults.delay; - params.progress = cmd_params_defaults.progress; + params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; + params.prio = cmd_params_defaults.prio; + params.delay = cmd_params_defaults.delay; + params.progress = cmd_params_defaults.progress; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -338,7 +365,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])}); + params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -358,7 +385,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -377,7 +404,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -437,7 +464,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector modes; for (const auto & m : p) { llama_split_mode mode; @@ -476,10 +503,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } else { std::string value(argv[i]); - /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; break; } + /**/ if (value == "distribute" || value == "") { + params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; + } else if (value == "isolate") { + params.numa = GGML_NUMA_STRATEGY_ISOLATE; + } else if (value == "numactl") { + params.numa = GGML_NUMA_STRATEGY_NUMACTL; + } else { + invalid_param = true; + break; + } } } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { @@ -509,9 +542,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } for (auto ts : string_split(argv[i], split_delim)) { // split string by ; and / - const std::regex regex{R"([;/]+)"}; - std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; - std::vector split_arg{it, {}}; + const std::regex regex{ R"([;/]+)" }; + std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 }; + std::vector split_arg{ it, {} }; GGML_ASSERT(split_arg.size() <= llama_max_devices()); std::vector tensor_split(llama_max_devices()); @@ -570,52 +603,94 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } // set defaults - if (params.model.empty()) { params.model = cmd_params_defaults.model; } - if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } - if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } - if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; } - if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } - if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } - if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } - if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } - if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } - if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } - if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } - if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } - if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } - if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; } - if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } - if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } - if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } - if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } - if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; } - if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; } - if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; } + if (params.model.empty()) { + params.model = cmd_params_defaults.model; + } + if (params.n_prompt.empty()) { + params.n_prompt = cmd_params_defaults.n_prompt; + } + if (params.n_gen.empty()) { + params.n_gen = cmd_params_defaults.n_gen; + } + if (params.n_pg.empty()) { + params.n_pg = cmd_params_defaults.n_pg; + } + if (params.n_batch.empty()) { + params.n_batch = cmd_params_defaults.n_batch; + } + if (params.n_ubatch.empty()) { + params.n_ubatch = cmd_params_defaults.n_ubatch; + } + if (params.type_k.empty()) { + params.type_k = cmd_params_defaults.type_k; + } + if (params.type_v.empty()) { + params.type_v = cmd_params_defaults.type_v; + } + if (params.n_gpu_layers.empty()) { + params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; + } + if (params.rpc_servers.empty()) { + params.rpc_servers = cmd_params_defaults.rpc_servers; + } + if (params.split_mode.empty()) { + params.split_mode = cmd_params_defaults.split_mode; + } + if (params.main_gpu.empty()) { + params.main_gpu = cmd_params_defaults.main_gpu; + } + if (params.no_kv_offload.empty()) { + params.no_kv_offload = cmd_params_defaults.no_kv_offload; + } + if (params.flash_attn.empty()) { + params.flash_attn = cmd_params_defaults.flash_attn; + } + if (params.tensor_split.empty()) { + params.tensor_split = cmd_params_defaults.tensor_split; + } + if (params.use_mmap.empty()) { + params.use_mmap = cmd_params_defaults.use_mmap; + } + if (params.embeddings.empty()) { + params.embeddings = cmd_params_defaults.embeddings; + } + if (params.n_threads.empty()) { + params.n_threads = cmd_params_defaults.n_threads; + } + if (params.cpu_mask.empty()) { + params.cpu_mask = cmd_params_defaults.cpu_mask; + } + if (params.cpu_strict.empty()) { + params.cpu_strict = cmd_params_defaults.cpu_strict; + } + if (params.poll.empty()) { + params.poll = cmd_params_defaults.poll; + } return params; } struct cmd_params_instance { - std::string model; - int n_prompt; - int n_gen; - int n_batch; - int n_ubatch; - ggml_type type_k; - ggml_type type_v; - int n_threads; - std::string cpu_mask; - bool cpu_strict; - int poll; - int n_gpu_layers; - std::string rpc_servers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; + std::string model; + int n_prompt; + int n_gen; + int n_batch; + int n_ubatch; + ggml_type type_k; + ggml_type type_v; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + int n_gpu_layers; + std::string rpc_servers; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; std::vector tensor_split; - bool use_mmap; - bool embeddings; + bool use_mmap; + bool embeddings; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -624,35 +699,31 @@ struct cmd_params_instance { if (!rpc_servers.empty()) { mparams.rpc_servers = rpc_servers.c_str(); } - mparams.split_mode = split_mode; - mparams.main_gpu = main_gpu; + mparams.split_mode = split_mode; + mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); - mparams.use_mmap = use_mmap; + mparams.use_mmap = use_mmap; return mparams; } bool equal_mparams(const cmd_params_instance & other) const { - return model == other.model && - n_gpu_layers == other.n_gpu_layers && - rpc_servers == other.rpc_servers && - split_mode == other.split_mode && - main_gpu == other.main_gpu && - use_mmap == other.use_mmap && + return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers && + split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split; } llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen; - cparams.n_batch = n_batch; - cparams.n_ubatch = n_ubatch; - cparams.type_k = type_k; - cparams.type_v = type_v; + cparams.n_ctx = n_prompt + n_gen; + cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; + cparams.type_k = type_k; + cparams.type_v = type_v; cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn = flash_attn; - cparams.embeddings = embeddings; + cparams.flash_attn = flash_attn; + cparams.embeddings = embeddings; return cparams; } @@ -662,6 +733,7 @@ static std::vector get_cmd_params_instances(const cmd_param std::vector instances; // this ordering minimizes the number of times that each model needs to be reloaded + // clang-format off for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) for (const auto & rpc : params.rpc_servers) @@ -767,100 +839,94 @@ static std::vector get_cmd_params_instances(const cmd_param instances.push_back(instance); } } + // clang-format on return instances; } struct test { static const std::string build_commit; - static const int build_number; + static const int build_number; static const std::string cpu_info; static const std::string gpu_info; - std::string model_filename; - std::string model_type; - uint64_t model_size; - uint64_t model_n_params; - int n_batch; - int n_ubatch; - int n_threads; - std::string cpu_mask; - bool cpu_strict; - int poll; - ggml_type type_k; - ggml_type type_v; - int n_gpu_layers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; - std::vector tensor_split; - bool use_mmap; - bool embeddings; - int n_prompt; - int n_gen; - std::string test_time; - std::vector samples_ns; + std::string model_filename; + std::string model_type; + uint64_t model_size; + uint64_t model_n_params; + int n_batch; + int n_ubatch; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + ggml_type type_k; + ggml_type type_v; + int n_gpu_layers; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; + std::vector tensor_split; + bool use_mmap; + bool embeddings; + int n_prompt; + int n_gen; + std::string test_time; + std::vector samples_ns; test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { model_filename = inst.model; char buf[128]; llama_model_desc(lmodel, buf, sizeof(buf)); - model_type = buf; - model_size = llama_model_size(lmodel); + model_type = buf; + model_size = llama_model_size(lmodel); model_n_params = llama_model_n_params(lmodel); - n_batch = inst.n_batch; - n_ubatch = inst.n_ubatch; - n_threads = inst.n_threads; - cpu_mask = inst.cpu_mask; - cpu_strict = inst.cpu_strict; - poll = inst.poll; - type_k = inst.type_k; - type_v = inst.type_v; - n_gpu_layers = inst.n_gpu_layers; - split_mode = inst.split_mode; - main_gpu = inst.main_gpu; - no_kv_offload = inst.no_kv_offload; - flash_attn = inst.flash_attn; - tensor_split = inst.tensor_split; - use_mmap = inst.use_mmap; - embeddings = inst.embeddings; - n_prompt = inst.n_prompt; - n_gen = inst.n_gen; + n_batch = inst.n_batch; + n_ubatch = inst.n_ubatch; + n_threads = inst.n_threads; + cpu_mask = inst.cpu_mask; + cpu_strict = inst.cpu_strict; + poll = inst.poll; + type_k = inst.type_k; + type_v = inst.type_v; + n_gpu_layers = inst.n_gpu_layers; + split_mode = inst.split_mode; + main_gpu = inst.main_gpu; + no_kv_offload = inst.no_kv_offload; + flash_attn = inst.flash_attn; + tensor_split = inst.tensor_split; + use_mmap = inst.use_mmap; + embeddings = inst.embeddings; + n_prompt = inst.n_prompt; + n_gen = inst.n_gen; // RFC 3339 date-time format - time_t t = time(NULL); + time_t t = time(NULL); std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); test_time = buf; (void) ctx; } - uint64_t avg_ns() const { - return ::avg(samples_ns); - } + uint64_t avg_ns() const { return ::avg(samples_ns); } - uint64_t stdev_ns() const { - return ::stdev(samples_ns); - } + uint64_t stdev_ns() const { return ::stdev(samples_ns); } std::vector get_ts() const { - int n_tokens = n_prompt + n_gen; + int n_tokens = n_prompt + n_gen; std::vector ts; - std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); + std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), + [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); return ts; } - double avg_ts() const { - return ::avg(get_ts()); - } + double avg_ts() const { return ::avg(get_ts()); } - double stdev_ts() const { - return ::stdev(get_ts()); - } + double stdev_ts() const { return ::stdev(get_ts()); } static std::string get_backend() { std::vector backends; for (size_t i = 0; i < ggml_backend_reg_count(); i++) { - auto * reg = ggml_backend_reg_get(i); + auto * reg = ggml_backend_reg_get(i); std::string name = ggml_backend_reg_name(reg); if (name != "CPU") { backends.push_back(ggml_backend_reg_name(reg)); @@ -871,36 +937,27 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", - "cpu_info", "gpu_info", "backends", - "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_ubatch", - "n_threads", "cpu_mask", "cpu_strict", "poll", - "type_k", "type_v", - "n_gpu_layers", "split_mode", - "main_gpu", "no_kv_offload", "flash_attn", - "tensor_split", "use_mmap", "embeddings", - "n_prompt", "n_gen", "test_time", - "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts", + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", + "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", + "avg_ts", "stddev_ts", }; return fields; } - enum field_type {STRING, BOOL, INT, FLOAT}; + enum field_type { STRING, BOOL, INT, FLOAT }; static field_type get_field_type(const std::string & field) { - if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || - field == "n_threads" || field == "poll" || - field == "model_size" || field == "model_n_params" || - field == "n_gpu_layers" || field == "main_gpu" || - field == "n_prompt" || field == "n_gen" || - field == "avg_ns" || field == "stddev_ns") { + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || + field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || + field == "stddev_ns") { return INT; } - if (field == "f16_kv" || field == "no_kv_offload" || - field == "cpu_strict" || - field == "flash_attn" || field == "use_mmap" || field == "embeddings") { + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || + field == "use_mmap" || field == "embeddings") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -911,7 +968,7 @@ struct test { std::vector get_values() const { std::string tensor_split_str; - int max_nonzero = 0; + int max_nonzero = 0; for (size_t i = 0; i < llama_max_devices(); i++) { if (tensor_split[i] > 0) { max_nonzero = i; @@ -925,29 +982,47 @@ struct test { tensor_split_str += "/"; } } - std::vector values = { - build_commit, std::to_string(build_number), - cpu_info, gpu_info, get_backend(), - model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_ubatch), - std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll), - ggml_type_name(type_k), ggml_type_name(type_v), - std::to_string(n_gpu_layers), split_mode_str(split_mode), - std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), - tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), - std::to_string(n_prompt), std::to_string(n_gen), test_time, - std::to_string(avg_ns()), std::to_string(stdev_ns()), - std::to_string(avg_ts()), std::to_string(stdev_ts()) - }; + std::vector values = { build_commit, + std::to_string(build_number), + cpu_info, + gpu_info, + get_backend(), + model_filename, + model_type, + std::to_string(model_size), + std::to_string(model_n_params), + std::to_string(n_batch), + std::to_string(n_ubatch), + std::to_string(n_threads), + cpu_mask, + std::to_string(cpu_strict), + std::to_string(poll), + ggml_type_name(type_k), + ggml_type_name(type_v), + std::to_string(n_gpu_layers), + split_mode_str(split_mode), + std::to_string(main_gpu), + std::to_string(no_kv_offload), + std::to_string(flash_attn), + tensor_split_str, + std::to_string(use_mmap), + std::to_string(embeddings), + std::to_string(n_prompt), + std::to_string(n_gen), + test_time, + std::to_string(avg_ns()), + std::to_string(stdev_ns()), + std::to_string(avg_ts()), + std::to_string(stdev_ts()) }; return values; } std::map get_map() const { std::map map; - auto fields = get_fields(); - auto values = get_values(); - std::transform(fields.begin(), fields.end(), values.begin(), - std::inserter(map, map.end()), std::make_pair); + auto fields = get_fields(); + auto values = get_values(); + std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()), + std::make_pair); return map; } }; @@ -961,9 +1036,12 @@ struct printer { virtual ~printer() {} FILE * fout; + virtual void print_header(const cmd_params & params) { (void) params; } + virtual void print_test(const test & t) = 0; - virtual void print_footer() { } + + virtual void print_footer() {} }; struct csv_printer : public printer { @@ -979,7 +1057,7 @@ struct csv_printer : public printer { return escaped; } - void print_header(const cmd_params & params) override { + void print_header(const cmd_params & params) override { std::vector fields = test::get_fields(); fprintf(fout, "%s\n", join(fields, ",").c_str()); (void) params; @@ -992,7 +1070,6 @@ struct csv_printer : public printer { } }; - static std::string escape_json(const std::string & value) { std::string escaped; for (auto c : value) { @@ -1000,7 +1077,7 @@ static std::string escape_json(const std::string & value) { escaped += "\\\""; } else if (c == '\\') { escaped += "\\\\"; - } else if (c <= 0x1f) { + } else if (c <= 0x1f) { char buf[8]; snprintf(buf, sizeof(buf), "\\u%04x", c); escaped += buf; @@ -1033,7 +1110,8 @@ struct json_printer : public printer { void print_fields(const std::vector & fields, const std::vector & values) { assert(fields.size() == values.size()); for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str()); + fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), + format_json_value(fields.at(i), values.at(i)).c_str()); } } @@ -1051,12 +1129,9 @@ struct json_printer : public printer { fflush(fout); } - void print_footer() override { - fprintf(fout, "\n]\n"); - } + void print_footer() override { fprintf(fout, "\n]\n"); } }; - struct jsonl_printer : public printer { void print_fields(const std::vector & fields, const std::vector & values) { assert(fields.size() == values.size()); @@ -1116,7 +1191,7 @@ struct markdown_printer : public printer { return 13; } - int width = std::max((int)field.length(), 10); + int width = std::max((int) field.length(), 10); if (test::get_field_type(field) == test::STRING) { return -width; @@ -1230,18 +1305,18 @@ struct markdown_printer : public printer { fprintf(fout, "|"); for (const auto & field : fields) { std::string value; - char buf[128]; + char buf[128]; if (field == "model") { value = t.model_type; } else if (field == "size") { - if (t.model_size < 1024*1024*1024) { + if (t.model_size < 1024 * 1024 * 1024) { snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0); } else { snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0); } value = buf; } else if (field == "params") { - if (t.model_n_params < 1000*1000*1000) { + if (t.model_n_params < 1000 * 1000 * 1000) { snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6); } else { snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9); @@ -1303,7 +1378,8 @@ struct sql_printer : public printer { std::vector fields = test::get_fields(); fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n"); for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : ""); + fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), + i < fields.size() - 1 ? "," : ""); } fprintf(fout, ");\n"); fprintf(fout, "\n"); @@ -1324,8 +1400,8 @@ struct sql_printer : public printer { static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); std::vector tokens(n_batch); @@ -1333,7 +1409,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); - tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; for (int i = 1; i < n_tokens; i++) { tokens[i] = std::rand() % n_vocab; } @@ -1347,8 +1423,8 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th static void test_gen(llama_context * ctx, int n_gen, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; @@ -1411,7 +1487,7 @@ int main(int argc, char ** argv) { set_process_priority(params.prio); // initialize printer - std::unique_ptr p = create_printer(params.output_format); + std::unique_ptr p = create_printer(params.output_format); std::unique_ptr p_err = create_printer(params.output_format_stderr); if (p) { @@ -1426,13 +1502,13 @@ int main(int argc, char ** argv) { std::vector params_instances = get_cmd_params_instances(params); - llama_model * lmodel = nullptr; + llama_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; - int params_idx = 0; + int params_idx = 0; auto params_count = params_instances.size(); for (const auto & inst : params_instances) { - params_idx ++; + params_idx++; if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); } @@ -1475,7 +1551,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); if (!threadpool) { fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1505,13 +1581,15 @@ int main(int argc, char ** argv) { if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, + i + 1, params.reps); } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, + i + 1, params.reps); } test_gen(ctx, t.n_gen, t.n_threads); } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 374ed47ad..7c4ce4be2 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -62,49 +62,6 @@ static bool file_is_empty(const std::string & path) { return f.tellg() == 0; } -static void write_logfile( - const llama_context * ctx, const common_params & params, const llama_model * model, - const std::vector & input_tokens, const std::string & output, - const std::vector & output_tokens -) { - if (params.logdir.empty()) { - return; - } - - const std::string timestamp = string_get_sortable_timestamp(); - - const bool success = fs_create_directory_with_parents(params.logdir); - if (!success) { - LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: main\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Generation Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - yaml_dump_string_multiline(logfile, "output", output.c_str()); - yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - - llama_perf_dump_yaml(logfile, ctx); - fclose(logfile); -} - #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void sigint_handler(int signo) { if (signo == SIGINT) { @@ -115,7 +72,6 @@ static void sigint_handler(int signo) { console::cleanup(); LOG("\n"); common_perf_print(*g_ctx, *g_smpl); - write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); // make sure all logs are flushed LOG("Interrupted by user\n"); @@ -926,7 +882,6 @@ int main(int argc, char ** argv) { LOG("\n\n"); common_perf_print(ctx, smpl); - write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); common_sampler_free(smpl); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index e803ff143..64a84607c 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -34,55 +34,6 @@ struct results_log_softmax { float prob; }; -static void write_logfile( - const llama_context * ctx, const common_params & params, const llama_model * model, - const struct results_perplexity & results -) { - if (params.logdir.empty()) { - return; - } - - if (params.hellaswag) { - LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__); - return; - } - - const std::string timestamp = string_get_sortable_timestamp(); - - const bool success = fs_create_directory_with_parents(params.logdir); - if (!success) { - LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: main\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Perplexity Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - yaml_dump_vector_float(logfile, "logits", results.logits); - fprintf(logfile, "ppl_value: %f\n", results.ppl_value); - yaml_dump_vector_float(logfile, "probs", results.probs); - - llama_perf_dump_yaml(logfile, ctx); - fclose(logfile); -} - static std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); float max_logit = logits[0]; @@ -2072,8 +2023,6 @@ int main(int argc, char ** argv) { LOG("\n"); llama_perf_context_print(ctx); - write_logfile(ctx, params, model, results); - llama_free(ctx); llama_free_model(model); diff --git a/examples/server/README.md b/examples/server/README.md index 6f72c6bb8..0936e0b7b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -85,7 +85,6 @@ The project is under active development, and we are [looking for feedback and co | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | -| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) | | `--log-disable` | Log disable | | `--log-file FNAME` | Log to file | | `--log-colors` | Enable colored logging
(env: LLAMA_LOG_COLORS) | diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 65a915d59..6216c0841 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -212,6 +212,9 @@
Other sampler settings
+ + + @@ -231,6 +234,7 @@ Advanced config
@@ -253,7 +257,7 @@