From f35726c2fb0a824246e004ab4bedcde37f3f0dd0 Mon Sep 17 00:00:00 2001
From: Jeff Bolz
Date: Sat, 25 Jan 2025 20:10:03 -0600
Subject: [PATCH 01/24] build: apply MSVC /bigobj option to c/cpp files only (#11423)

---
 CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e7f520582..2f2b1a201 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,7 +50,8 @@ endif()
 
 if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
-    add_compile_options(/bigobj)
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()
 
 #

From 2cc9b8c32c78d09cd1b4df0aaa605ab2d0176243 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 26 Jan 2025 14:30:15 +0200
Subject: [PATCH 02/24] readme : update hot topics

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 97d028670..ff8536773 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ## Hot topics
 
+- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427
 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123

From 1d8ee06000ecdd274e7f0a0465d6bf26ad2b3491 Mon Sep 17 00:00:00 2001
From: Frank Mai
Date: Sun, 26 Jan 2025 23:20:34 +0800
Subject: [PATCH 03/24] rpc: fix register position (#11424)

Signed-off-by: thxCode
---
 src/llama-model.cpp |  2 ++
 src/llama.cpp       | 12 +++++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 031b4c30b..18bd0b071 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1303,10 +1303,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
         auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
             if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+                LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
                 return {cpu_dev, &pimpl->cpu_buft_list};
             }
             const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
             auto * dev = devices.at(layer_gpu);
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
             return {dev, &pimpl->gpu_buft_list.at(dev)};
         };
 
diff --git a/src/llama.cpp b/src/llama.cpp
index e8cfe5012..094157ccf 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -9405,6 +9405,7 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        std::vector<ggml_backend_dev_t> rpc_servers;
         // use all available devices
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -9415,10 +9416,19 @@ static struct llama_model * llama_model_load_from_file_impl(
                     break;
 
                 case GGML_BACKEND_DEVICE_TYPE_GPU:
-                    model->devices.push_back(dev);
+                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                        rpc_servers.push_back(dev);
+                    } else {
+                        model->devices.push_back(dev);
+                    }
                     break;
             }
         }
+        // add RPC servers at the front of the list
+        if (!rpc_servers.empty()) {
+            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+        }
     }
 
     // if using single GPU mode, remove all except the main GPU

From 19f65187cbf009801288861133267ee5573ceead Mon Sep 17 00:00:00 2001
From: bandoti <141645996+bandoti@users.noreply.github.com>
Date: Sun, 26 Jan 2025 12:07:48 -0400
Subject: [PATCH 04/24] cmake: add ggml find package (#11369)

* Add initial ggml cmake package

* Add build numbers to ggml find-package

* Expand variables with GGML_ prefix

* Guard against adding to cache variable twice

* Add git to msys2 workflow

* Handle ggml-cpu-* variants

* Link ggml/ggml-base libraries to their targets

* Replace main-cmake-pkg with simple-cmake-pkg

* Interface features require c_std_90

* Fix typo

* Removed unnecessary bracket from status message

* Update examples/simple-cmake-pkg/README.md

Co-authored-by: Georgi Gerganov

* Update examples/simple-cmake-pkg/README.md

Co-authored-by: Georgi Gerganov

---------

Co-authored-by: Georgi Gerganov
---
 .github/workflows/build.yml                         |   1 +
 CMakeLists.txt                                      |  23 +--
 cmake/llama-config.cmake.in                         | 156 +-----------------
 examples/main-cmake-pkg/CMakeLists.txt              |  32 ----
 examples/main-cmake-pkg/README.md                   |  31 ----
 .../{main-cmake-pkg => simple-cmake-pkg}/.gitignore |   0
 examples/simple-cmake-pkg/CMakeLists.txt            |  11 ++
 examples/simple-cmake-pkg/README.md                 |  34 ++++
 ggml/CMakeLists.txt                                 |  71 ++++++++
 ggml/cmake/ggml-config.cmake.in                     | 147 +++++++++++++++++
 ggml/src/CMakeLists.txt                             |  11 ++
 11 files changed, 284 insertions(+), 233 deletions(-)
 delete mode 100644 examples/main-cmake-pkg/CMakeLists.txt
 delete mode 100644 examples/main-cmake-pkg/README.md
 rename examples/{main-cmake-pkg => simple-cmake-pkg}/.gitignore (100%)
 create mode 100644 examples/simple-cmake-pkg/CMakeLists.txt
 create mode 100644 examples/simple-cmake-pkg/README.md
 create mode 100644 ggml/cmake/ggml-config.cmake.in

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 37cb6b1e7..cd8422f8a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -613,6 +613,7 @@ jobs:
           msystem: 
${{matrix.sys}} install: >- base-devel + git mingw-w64-${{matrix.env}}-toolchain mingw-w64-${{matrix.env}}-cmake mingw-w64-${{matrix.env}}-openblas diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f2b1a201..4c62d1788 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,27 +188,14 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") -# At the moment some compile definitions are placed within the ggml/src -# directory but not exported on the `ggml` target. This could be improved by -# determining _precisely_ which defines are necessary for the llama-config -# package. -# -set(GGML_TRANSIENT_DEFINES) -get_target_property(GGML_DIRECTORY ggml SOURCE_DIR) -get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS) -if (GGML_DIR_DEFINES) - list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES}) -endif() -get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS) -if (GGML_TARGET_DEFINES) - list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES}) -endif() -get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) -# all public headers set(LLAMA_PUBLIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h) -set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}") + +set_target_properties(llama + PROPERTIES + PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}") + install(TARGETS llama LIBRARY PUBLIC_HEADER) configure_package_config_file( diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index 5c55bc6b8..40ade96e5 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -3,159 +3,13 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) -set(GGML_STATIC @GGML_STATIC@) -set(GGML_NATIVE @GGML_NATIVE@) -set(GGML_LTO @GGML_LTO@) -set(GGML_CCACHE @GGML_CCACHE@) -set(GGML_AVX @GGML_AVX@) -set(GGML_AVX2 @GGML_AVX2@) -set(GGML_AVX512 @GGML_AVX512@) -set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@) -set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@) -set(GGML_AVX512_BF16 @GGML_AVX512_BF16@) -set(GGML_AMX_TILE @GGML_AMX_TILE@) -set(GGML_AMX_INT8 @GGML_AMX_INT8@) -set(GGML_AMX_BF16 @GGML_AMX_BF16@) -set(GGML_FMA @GGML_FMA@) -set(GGML_LASX @GGML_LASX@) -set(GGML_LSX @GGML_LSX@) -set(GGML_RVV @GGML_RVV@) -set(GGML_SVE @GGML_SVE@) - -set(GGML_ACCELERATE @GGML_ACCELERATE@) -set(GGML_OPENMP @GGML_OPENMP@) -set(GGML_CPU_HBM @GGML_CPU_HBM@) -set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@) - -set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@) -set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@) -set(GGML_CUDA_F16 @GGML_CUDA_F16@) -set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@) -set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@) -set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@) -set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@) -set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@) - -set(GGML_HIP_UMA @GGML_HIP_UMA@) - -set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) -set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) -set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) -set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@) -set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@) -set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) -set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@) - -set(GGML_METAL_USE_BF16 
@GGML_METAL_USE_BF16@) -set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@) -set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@) -set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@) -set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@) -set(GGML_METAL_STD @GGML_METAL_STD@) - -set(GGML_SYCL_F16 @GGML_SYCL_F16@) -set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@) -set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@) - - @PACKAGE_INIT@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") -find_package(Threads REQUIRED) - -set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") -set(_llama_link_deps "") -set(_llama_link_opts "") -foreach(_ggml_lib ggml ggml-base) - string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY") - find_library(${_ggml_lib_var} ${_ggml_lib} - REQUIRED - HINTS ${LLAMA_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH - ) - list(APPEND _llama_link_deps "${${_ggml_lib_var}}") - message(STATUS "Found ${${_ggml_lib_var}}") -endforeach() - -foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan) - string(TOUPPER "GGML_${backend}" backend_id) - set(_ggml_lib "ggml-${backend}") - string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY") - - find_library(${_ggml_lib_var} ${_ggml_lib} - HINTS ${LLAMA_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH - ) - if(${_ggml_lib_var}) - list(APPEND _llama_link_deps "${${_ggml_lib_var}}") - set(${backend_id} ON) - message(STATUS "Found backend ${${_ggml_lib_var}}") - else() - set(${backend_id} OFF) - endif() -endforeach() - -if (NOT LLAMA_SHARED_LIB) - if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) - list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK}) - endif() - - if (GGML_OPENMP) - find_package(OpenMP REQUIRED) - list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - endif() - - if (GGML_CPU_HBM) - find_library(memkind memkind REQUIRED) - list(APPEND _llama_link_deps memkind) - endif() - - if (GGML_BLAS) - find_package(BLAS REQUIRED) - list(APPEND _llama_link_deps ${BLAS_LIBRARIES}) - list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS}) - endif() - - if (GGML_CUDA) - find_package(CUDAToolkit REQUIRED) - endif() - - if (GGML_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) - list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY} - ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK}) - endif() - - if (GGML_VULKAN) - find_package(Vulkan REQUIRED) - list(APPEND _llama_link_deps Vulkan::Vulkan) - endif() - - if (GGML_HIP) - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) - list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas) - endif() - - if (GGML_SYCL) - find_package(DNNL) - if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") - list(APPEND _llama_link_deps DNNL::dnnl) - endif() - if (WIN32) - find_package(IntelSYCL REQUIRED) - find_package(MKL REQUIRED) - list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) - endif() - endif() -endif() +find_package(ggml REQUIRED) find_library(llama_LIBRARY llama REQUIRED @@ -167,12 +21,10 @@ add_library(llama UNKNOWN IMPORTED) set_target_properties(llama PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" - INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" - INTERFACE_LINK_OPTIONS "${_llama_link_opts}" - 
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}" + INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" IMPORTED_LOCATION "${llama_LIBRARY}" - INTERFACE_COMPILE_FEATURES cxx_std_11 - POSITION_INDEPENDENT_CODE ON ) + INTERFACE_COMPILE_FEATURES c_std_90 + POSITION_INDEPENDENT_CODE ON) check_required_components(Llama) diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt deleted file mode 100644 index 5563f4de0..000000000 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -cmake_minimum_required(VERSION 3.12) -project("llama-cli-cmake-pkg" C CXX) -set(TARGET llama-cli-cmake-pkg) - -find_package(Llama 0.0.1 REQUIRED) - -# Bake common functionality in with target. Because applications -# using the relocatable Llama package should be outside of the -# source tree, llama-cli-cmake-pkg pretends the dependencies are built-in. -set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common") -add_library(common OBJECT) -file(GLOB _common_files - "${_common_path}/*.h" - "${_common_path}/*.cpp" -) -target_sources(common PRIVATE ${_common_files}) - -# If the common project was part of "llama-cli-cmake-pkg" the transient -# defines would automatically be attached. Because the common func- -# tionality is separate, but dependent upon the defines, it must be -# explicitly extracted from the "llama" target. -# -get_target_property(_llama_transient_defines llama - INTERFACE_COMPILE_DEFINITIONS) - -target_compile_definitions(common PRIVATE "${_llama_transient_defines}") - -add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp) -target_include_directories(${TARGET} PRIVATE ${_common_path}) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md deleted file mode 100644 index 08d83dd08..000000000 --- a/examples/main-cmake-pkg/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# llama.cpp/example/main-cmake-pkg - -This program builds [llama-cli](../main) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree. - -## Building - -Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions. - -### Considerations - -When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package. 
- -### Build llama.cpp and install to C:\LlamaCPP directory - -```cmd -git clone https://github.com/ggerganov/llama.cpp -cd llama.cpp -cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64 -cmake --build build --config Release -cmake --install build --prefix C:/LlamaCPP -``` - -### Build llama-cli-cmake-pkg - - -```cmd -cd ..\examples\main-cmake-pkg -cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64 -cmake --build build --config Release -cmake --install build --prefix C:/MyLlamaApp -``` diff --git a/examples/main-cmake-pkg/.gitignore b/examples/simple-cmake-pkg/.gitignore similarity index 100% rename from examples/main-cmake-pkg/.gitignore rename to examples/simple-cmake-pkg/.gitignore diff --git a/examples/simple-cmake-pkg/CMakeLists.txt b/examples/simple-cmake-pkg/CMakeLists.txt new file mode 100644 index 000000000..128e38c8f --- /dev/null +++ b/examples/simple-cmake-pkg/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.12) +project(llama-simple-cmake-pkg) + +set(TARGET llama-simple-cmake-pkg) + +find_package(Llama REQUIRED) + +add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../simple/simple.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama ggml::all ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple-cmake-pkg/README.md b/examples/simple-cmake-pkg/README.md new file mode 100644 index 000000000..8b30049e2 --- /dev/null +++ b/examples/simple-cmake-pkg/README.md @@ -0,0 +1,34 @@ +# llama.cpp/example/simple-cmake-pkg + +This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree. + +## Building + +Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions. + +### Considerations + +When hardware acceleration libraries are used (e.g. CUDA, Metal, Vulkan, etc.), the appropriate dependencies will be searched for automatically. So, for example, when finding a package + +### Build llama.cpp and install to llama.cpp/inst + +```sh +git clone https://github.com/ggerganov/llama.cpp +cd llama.cpp +cmake -S . -B build +cmake --build build +cmake --install build --prefix inst + +### Build simple-cmake-pkg + +```sh +cd examples/simple-cmake-pkg +cmake -S . -B build -DCMAKE_PREFIX_PATH=../../inst/lib/cmake +cmake --build build +``` + +### Run simple-cmake-pkg + +```sh +./build/llama-simple-cmake-pkg -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" +``` diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index bbabb14de..7c069e420 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -267,3 +267,74 @@ if (GGML_STANDALONE) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc DESTINATION share/pkgconfig) endif() + +# +# Create CMake package +# + +# Generate version info based on git commit. 
+ +find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH) +execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE GGML_BUILD_NUMBER + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +if(GGML_BUILD_NUMBER EQUAL 1) + message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.") +endif() + +execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE GGML_BUILD_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Capture variables prefixed with GGML_. + +set(variable_set_statements +" +####### Expanded from @GGML_VARIABLES_EXPANED@ by configure_package_config_file() ####### +####### Any changes to this file will be overwritten by the next CMake run ####### + +") + +set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS}) + +get_cmake_property(all_variables VARIABLES) +foreach(variable_name IN LISTS all_variables) + if(variable_name MATCHES "^GGML_") + string(REPLACE ";" "\\;" + variable_value "${${variable_name}}") + + set(variable_set_statements + "${variable_set_statements}set(${variable_name} \"${variable_value}\")\n") + endif() +endforeach() + +set(GGML_VARIABLES_EXPANDED ${variable_set_statements}) + +# Create the CMake package and set install location. + +set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER}) +set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") +set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") +set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") + +configure_package_config_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml + PATH_VARS GGML_INCLUDE_INSTALL_DIR + GGML_LIB_INSTALL_DIR + GGML_BIN_INSTALL_DIR) + +write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake + VERSION ${GGML_INSTALL_VERSION} + COMPATIBILITY SameMajorVersion) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml) diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in new file mode 100644 index 000000000..bf39f9c00 --- /dev/null +++ b/ggml/cmake/ggml-config.cmake.in @@ -0,0 +1,147 @@ + +@GGML_VARIABLES_EXPANDED@ + +@PACKAGE_INIT@ + +set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@") +set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@") +set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@") + +find_package(Threads REQUIRED) + +find_library(GGML_LIBRARY ggml + REQUIRED + HINTS ${GGML_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH) + +add_library(ggml::ggml UNKNOWN IMPORTED) +set_target_properties(ggml::ggml + PROPERTIES + IMPORTED_LOCATION "${GGML_LIBRARY}") + +find_library(GGML_BASE_LIBRARY ggml-base + REQUIRED + HINTS ${GGML_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH) + +add_library(ggml::ggml-base UNKNOWN IMPORTED) +set_target_properties(ggml::ggml-base + PROPERTIES + IMPORTED_LOCATION "${GGML_BASE_LIBRARY}") + +if (NOT GGML_SHARED_LIB) + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) + list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK}) + endif() + + if (GGML_OPENMP) + find_package(OpenMP REQUIRED) + list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + endif() + + if 
(GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) + list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind) + endif() + + if (GGML_BLAS) + find_package(BLAS REQUIRED) + list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES}) + list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS}) + endif() + + if (GGML_CUDA) + find_package(CUDAToolkit REQUIRED) + endif() + + if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + + list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES + ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK}) + endif() + + if (GGML_VULKAN) + find_package(Vulkan REQUIRED) + list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan) + endif() + + if (GGML_HIP) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) + list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas) + endif() + + if (GGML_SYCL) + find_package(DNNL) + if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") + list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl) + endif() + if (WIN32) + find_package(IntelSYCL REQUIRED) + find_package(MKL REQUIRED) + list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + endif() + endif() +endif() + +set(_ggml_all_targets "") +foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS}) + string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}") + string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx) + + find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend} + REQUIRED + HINTS ${GGML_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH) + + message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}") + + add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED) + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}" + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}" + INTERFACE_COMPILE_FEATURES c_std_90 + POSITION_INDEPENDENT_CODE ON) + + string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}") + if(is_cpu_variant) + list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base") + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}") + + if(GGML_CPU_INTERFACE_LINK_OPTIONS) + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}") + endif() + + else() + list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base") + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}") + + if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS) + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}") + endif() + endif() + + list(APPEND _ggml_all_targets ggml::${_ggml_backend}) +endforeach() + +add_library(ggml::all INTERFACE IMPORTED) +set_target_properties(ggml::all + PROPERTIES + INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}") + +check_required_components(ggml) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ae1cd2337..8d2b948fb 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -250,6 +250,17 @@ function(ggml_add_backend_library backend) target_compile_definitions(${backend} PRIVATE 
GGML_BACKEND_BUILD) target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED) endif() + + if(NOT GGML_AVAILABLE_BACKENDS) + set(GGML_AVAILABLE_BACKENDS "${backend}" + CACHE INTERNAL "List of backends for cmake package") + else() + list(FIND GGML_AVAILABLE_BACKENDS "${backend}" has_backend) + if(has_backend EQUAL -1) + set(GGML_AVAILABLE_BACKENDS "${GGML_AVAILABLE_BACKENDS};${backend}" + CACHE INTERNAL "List of backends for cmake package") + endif() + endif() endfunction() function(ggml_add_backend backend) From 6f53d8a6b41e48c73b345fc6c712c3b00ea4fb93 Mon Sep 17 00:00:00 2001 From: Nuno Date: Sun, 26 Jan 2025 18:22:43 +0100 Subject: [PATCH 05/24] docker: add missing vulkan library to base layer and update to 24.04 (#11422) Signed-off-by: rare-magma --- .devops/vulkan.Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index cfc2162e3..ad5dcd374 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -1,4 +1,4 @@ -ARG UBUNTU_VERSION=jammy +ARG UBUNTU_VERSION=24.04 FROM ubuntu:$UBUNTU_VERSION AS build @@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget # Install Vulkan SDK and cURL RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ - wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \ apt update -y && \ apt-get install -y vulkan-sdk libcurl4-openssl-dev curl @@ -34,7 +34,7 @@ RUN mkdir -p /app/full \ FROM ubuntu:$UBUNTU_VERSION AS base RUN apt-get update \ - && apt-get install -y libgomp1 curl\ + && apt-get install -y libgomp1 curl libvulkan-dev \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ From 178a7eb952d211b8d4232d5e50ae1b64519172a9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 26 Jan 2025 20:06:16 +0200 Subject: [PATCH 06/24] metal : use residency sets (#11427) * metal : use residency sets ggml-ci * metal : restore commandBufferWithUnretainedReferences calls [no ci] * metal : release descriptors ggml-ci * metal : check env GGML_METAL_NO_RESIDENCY ggml-ci * metal : fix build + clean-up ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 136 +++++++++++++++++++++++++++---- 1 file changed, 119 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index a85502ee0..c9474345d 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -19,7 +19,10 @@ // max number of MTLCommandBuffer used to submit a graph for processing #define GGML_METAL_MAX_COMMAND_BUFFERS 8 -#define UNUSED(x) (void)(x) +// create residency sets only on macOS >= 15.0 +#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 +#define GGML_METAL_HAS_RESIDENCY_SETS 1 +#endif // globals @@ -39,6 +42,7 @@ static struct ggml_backend_metal_device_context { bool has_simdgroup_reduction; bool has_simdgroup_mm; + bool has_residency_sets; bool has_bfloat; bool use_bfloat; @@ -48,6 +52,7 @@ static struct ggml_backend_metal_device_context { /*.mtl_device_ref_count =*/ 0, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, + /*.has_residency_sets =*/ false, /*.has_bfloat =*/ false, /*.use_bfloat =*/ false, /*.name =*/ "", @@ -65,6 +70,10 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev ctx->has_simdgroup_mm 
= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; +#if defined(GGML_METAL_HAS_RESIDENCY_SETS) + ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL; +#endif + ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; @@ -483,6 +492,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->queue = [device newCommandQueue]; + if (ctx->queue == nil) { + GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); + return NULL; + } + ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); id metal_library; @@ -649,6 +663,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false"); GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false"); + GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false"); GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false"); GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false"); GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false"); @@ -1035,8 +1050,70 @@ struct ggml_backend_metal_buffer_context { // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap int n_buffers; struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; + + // optional MTLResidencySet + id rset; }; +// rset init +static bool ggml_backend_metal_buffer_rset_init( + struct ggml_backend_metal_buffer_context * ctx, + struct ggml_backend_metal_device_context * ctx_dev, + id device) { + ctx->rset = nil; + + if (!ctx_dev->has_residency_sets) { + return true; + } + +#if defined(GGML_METAL_HAS_RESIDENCY_SETS) + if (@available(macOS 15.0, *)) { + MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init]; + desc.label = @"ggml_backend_metal"; + desc.initialCapacity = ctx->n_buffers; + + NSError * error; + ctx->rset = [device newResidencySetWithDescriptor:desc error:&error]; + if (error) { + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + [desc release]; + return false; + } + + [desc release]; + + for (int i = 0; i < ctx->n_buffers; i++) { + [ctx->rset addAllocation:ctx->buffers[i].metal]; + } + + [ctx->rset commit]; + [ctx->rset requestResidency]; + + return true; + } +#else + GGML_UNUSED(ctx_dev); + GGML_UNUSED(device); +#endif + + return true; +} + +// rset free +static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) { +#if defined(GGML_METAL_HAS_RESIDENCY_SETS) + if (@available(macOS 15.0, *)) { + if (ctx->rset) { + [ctx->rset endResidency]; + [ctx->rset removeAllAllocations]; + [ctx->rset release]; + } + } +#else + GGML_UNUSED(ctx); +#endif +} + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -4176,6 +4253,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) for (int i = 0; i < ctx->n_buffers; i++) { [ctx->buffers[i].metal 
release]; } + + ggml_backend_metal_buffer_rset_free(ctx); ggml_backend_metal_device_rel(buffer->buft->device->context); if (ctx->owned) { @@ -4198,19 +4277,19 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { memset((char *)tensor->data + offset, value, size); - UNUSED(buffer); + GGML_UNUSED(buffer); } static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); - UNUSED(buffer); + GGML_UNUSED(buffer); } static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); - UNUSED(buffer); + GGML_UNUSED(buffer); } static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { @@ -4220,7 +4299,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c } return false; - UNUSED(buffer); + GGML_UNUSED(buffer); } static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -4246,7 +4325,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "Metal"; - UNUSED(buft); + GGML_UNUSED(buft); } static void ggml_backend_metal_log_allocated_size(id device, size_t size_aligned) { @@ -4270,8 +4349,8 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s } #endif #endif - UNUSED(device); - UNUSED(size_aligned); + GGML_UNUSED(device); + GGML_UNUSED(size_aligned); } static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -4284,7 +4363,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba size_aligned += (size_page - (size_aligned % size_page)); } - id device = ggml_backend_metal_device_acq(buft->device->context); + struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context; + id device = ggml_backend_metal_device_acq(ctx_dev); ctx->all_data = ggml_metal_host_malloc(size_aligned); ctx->all_size = size_aligned; @@ -4307,7 +4387,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); - ggml_backend_metal_device_rel(buft->device->context); + ggml_backend_metal_device_rel(ctx_dev); + return NULL; + } + + if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { + GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); + free(ctx); + ggml_backend_metal_device_rel(ctx_dev); return NULL; } @@ -4318,7 +4405,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 32; - UNUSED(buft); + GGML_UNUSED(buft); } static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { @@ -4328,13 +4415,13 @@ 
static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty return max_size; - UNUSED(buft); + GGML_UNUSED(buft); } static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; - UNUSED(buft); + GGML_UNUSED(buft); } ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { @@ -4357,7 +4444,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) { return "Metal_Mapped"; - UNUSED(buft); + GGML_UNUSED(buft); } static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) { @@ -4400,7 +4487,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz size_aligned += (size_page - (size_aligned % size_page)); } - id device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main); + struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main; + id device = ggml_backend_metal_device_acq(ctx_dev); // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { @@ -4453,6 +4541,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz } } + if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { + GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); + free(ctx); + ggml_backend_metal_device_rel(ctx_dev); + return NULL; + } + return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); } @@ -4461,7 +4556,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; - UNUSED(backend); + GGML_UNUSED(backend); } static void ggml_backend_metal_free(ggml_backend_t backend) { @@ -4766,6 +4861,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back } } + if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { + GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); + free(ctx); + ggml_backend_metal_device_rel(ctx_dev); + return NULL; + } + return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); } @@ -4779,7 +4881,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name || buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name; - UNUSED(dev); + GGML_UNUSED(dev); } static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { From caf773f249aa267c78d3da5567b8ab156080ea59 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 26 Jan 2025 22:45:32 +0100 Subject: [PATCH 07/24] docker : fix ARM build and Vulkan build (#11434) * ci : do not fail-fast for docker * build arm64/amd64 separatedly * fix pip * no fast fail * vulkan: try jammy --- .devops/vulkan.Dockerfile | 4 ++-- .github/workflows/docker.yml | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index ad5dcd374..b5bd3b6d2 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -1,4 +1,4 @@ -ARG UBUNTU_VERSION=24.04 +ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION AS build @@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget # Install Vulkan SDK and 
cURL RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ - wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ apt update -y && \ apt-get install -y vulkan-sdk libcurl4-openssl-dev curl diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index d71f1eb38..6bf22eb66 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -32,10 +32,12 @@ jobs: env: COMMIT_SHA: ${{ github.sha }} strategy: + fail-fast: false matrix: config: # Multi-stage build - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false} + - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} + - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/arm64", full: true, light: true, server: true, freediskspace: false} - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} From acd38efee316f3a5ed2e6afcbc5814807c347053 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Mon, 27 Jan 2025 02:41:59 -0500 Subject: [PATCH 08/24] metal: Handle null returned from MTLCreateSystemDefaultDevice() (#11441) This fixes segmentation fault error when running tests when no metal devices are available (for example, when not linked with Core Graphics framework or otherwise). 
--- ggml/src/ggml-metal/ggml-metal.m | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index c9474345d..76f8e4291 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -64,7 +64,9 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev if (ctx->mtl_device == nil) { ctx->mtl_device = MTLCreateSystemDefaultDevice(); + } + if (ctx->mtl_device) { ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -99,8 +101,10 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_device_ref_count--; if (ctx->mtl_device_ref_count == 0) { - [ctx->mtl_device release]; - ctx->mtl_device = nil; + if (ctx->mtl_device) { + [ctx->mtl_device release]; + ctx->mtl_device = nil; + } } } From df984e014714cba4c99ef894b20b51cbcef31b16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 27 Jan 2025 12:07:12 +0100 Subject: [PATCH 09/24] llama: refactor llama_decode_impl (#11381) --- src/llama.cpp | 243 +++++++++++++++++++++++++++++--------------------- 1 file changed, 140 insertions(+), 103 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 094157ccf..12e8f41fc 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8432,13 +8432,141 @@ static enum ggml_status llama_graph_compute( return status; } +static int llama_prepare_sbatch( + llama_context & lctx, + const llama_batch & batch, + uint32_t & n_outputs) { + const auto & model = lctx.model; + const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; + + const uint32_t n_tokens_all = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + if (batch.token) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + if (batch.token[i] < 0 || uint32_t(batch.token[i]) >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } + GGML_ASSERT(n_tokens_all <= cparams.n_batch); + GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); + + lctx.n_queued_tokens += n_tokens_all; + lctx.embd_seq.clear(); + + // count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + n_outputs += batch.logits[i] != 0; + } + } else if (lctx.logits_all || embd_pooled) { + n_outputs = n_tokens_all; + } else { + // keep last output only + n_outputs = 1; + } + + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !lctx.kv_self.recurrent, + /* logits_all */ n_outputs == n_tokens_all); + + // reserve output buffer + if (llama_output_reserve(lctx, n_outputs) < n_outputs) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs); + return -2; + }; + + return 0; +} + +static int llama_prepare_ubatch( + llama_context & lctx, + llama_kv_slot_restorer & kv_slot_restorer, + llama_ubatch & ubatch, + const uint32_t n_outputs, + const uint32_t n_tokens_all) { + GGML_ASSERT(lctx.sbatch.n_tokens > 0); + + auto & kv_self = lctx.kv_self; + const auto & 
cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + if (lctx.kv_self.recurrent) { + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = lctx.sbatch.split_seq(cparams.n_ubatch); + } else { + // recurrent model architectures are easier to implement + // with equal-length sequences + ubatch = lctx.sbatch.split_equal(cparams.n_ubatch); + } + } else { + ubatch = lctx.sbatch.split_simple(cparams.n_ubatch); + } + + // count the outputs in this u_batch + { + int32_t n_outputs_new = 0; + + if (n_outputs == n_tokens_all) { + n_outputs_new = ubatch.n_tokens; + } else { + GGML_ASSERT(ubatch.output); + for (uint32_t i = 0; i < ubatch.n_tokens; i++) { + n_outputs_new += int32_t(ubatch.output[i] != 0); + } + } + + // needs to happen before the graph is built + lctx.n_outputs = n_outputs_new; + } + + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + llama_kv_cache_update(&lctx); + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { + kv_self.head = 0; + } + + const auto slot = llama_kv_cache_find_slot(kv_self, ubatch); + if (!slot) { + return 1; + } + kv_slot_restorer.save(slot); + + if (!kv_self.recurrent) { + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + const uint32_t pad = llama_kv_cache_get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); + } + } + + return 0; +} + // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state // (for non-recurrent models) or cleaned (for recurrent models) // // - lctx: llama context -// - batch: batch to evaluate +// - inp_batch: batch to evaluate // // return 0 on success // return positive int on warning @@ -8455,37 +8583,18 @@ static int llama_decode_impl( return -1; } - // temporary allocate memory for the input batch if needed + // temporarily allocate memory for the input batch if needed llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.kv_self.max_pos() + 1); - const llama_batch & batch = batch_allocr.batch; - const uint32_t n_tokens_all = batch.n_tokens; const auto & model = lctx.model; const auto & vocab = model.vocab; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - - if (batch.token) { - for (uint32_t i = 0; i < n_tokens_all; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; - } - } - } - - GGML_ASSERT(n_tokens_all <= cparams.n_batch); - - GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); - if (lctx.t_compute_start_us == 0) { lctx.t_compute_start_us = ggml_time_us(); } - lctx.n_queued_tokens += n_tokens_all; - auto & kv_self = lctx.kv_self; llama_kv_slot_restorer kv_slot_restorer(kv_self); @@ -8495,99 +8604,27 @@ static int llama_decode_impl( uint32_t n_outputs = 0; uint32_t n_outputs_prev = 0; - const auto n_ubatch = cparams.n_ubatch; - - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - - lctx.embd_seq.clear(); - - // count outputs - if (batch.logits && !embd_pooled) { - for (uint32_t i = 0; i < n_tokens_all; ++i) { - n_outputs += batch.logits[i] != 0; + { + const int ret = llama_prepare_sbatch(lctx, batch, n_outputs); + if (ret != 0) { + return ret; } - } else if (lctx.logits_all || embd_pooled) { - n_outputs = n_tokens_all; - } else { - // keep last output only - n_outputs = 1; } - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ n_outputs == n_tokens_all); - - // reserve output buffer - if (llama_output_reserve(lctx, n_outputs) < n_outputs) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs); - return -2; - }; - while (lctx.sbatch.n_tokens > 0) { llama_ubatch ubatch; - if (kv_self.recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = lctx.sbatch.split_seq(n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = lctx.sbatch.split_equal(n_ubatch); - } - } else { - ubatch = lctx.sbatch.split_simple(n_ubatch); - } - const uint32_t n_tokens = ubatch.n_tokens; - - // count the outputs in this u_batch { - int32_t n_outputs_new = 0; - - if (n_outputs == n_tokens_all) { - n_outputs_new = n_tokens; - } else { - GGML_ASSERT(ubatch.output); - for (uint32_t i = 0; i < n_tokens; i++) { - n_outputs_new += (int32_t) (ubatch.output[i] != 0); - } + const int ret = llama_prepare_ubatch(lctx, kv_slot_restorer, ubatch, n_outputs, batch.n_tokens); + if (ret != 0) { + return ret; } - - // needs to happen before the graph is built - lctx.n_outputs = n_outputs_new; } - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; + const int n_threads = ubatch.n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + ggml_threadpool_t threadpool = ubatch.n_tokens == 1 ? 
lctx.threadpool : lctx.threadpool_batch;
 
         GGML_ASSERT(n_threads > 0);
 
-        // non-causal masks do not use the KV cache
-        if (hparams.causal_attn) {
-            llama_kv_cache_update(&lctx);
-
-            // if we have enough unused cells before the current head ->
-            //   better to start searching from the beginning of the cache, hoping to fill it
-            if (kv_self.head > kv_self.used + 2*n_tokens) {
-                kv_self.head = 0;
-            }
-
-            const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
-            if (!slot) {
-                return 1;
-            }
-            kv_slot_restorer.save(slot);
-
-            if (!kv_self.recurrent) {
-                // a heuristic, to avoid attending the full cache if it is not yet utilized
-                // after enough generations, the benefit from this heuristic disappears
-                // if we start defragmenting the cache, the benefit from this will be more important
-                const uint32_t pad = llama_kv_cache_get_padding(cparams);
-                kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
-                //kv_self.n = llama_kv_cache_cell_max(kv_self);
-            }
-        }
-
         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
         ggml_backend_sched_reset(lctx.sched.get());
@@ -8640,7 +8677,7 @@ static int llama_decode_impl(
 
         // update the kv ring buffer
         {
-            kv_self.head += n_tokens;
+            kv_self.head += ubatch.n_tokens;
 
             // Ensure kv cache head points to a valid index.
             if (kv_self.head >= kv_self.size) {

From a5203b4465c5c87813936bde98170e25bb09024f Mon Sep 17 00:00:00 2001
From: lexasub
Date: Mon, 27 Jan 2025 17:42:09 +0400
Subject: [PATCH 10/24] llama : minor fixes for up llama load model speed (#11448)

* impl::load change map bpe_ranks to onordered map for reduce time of impl::load on 30%

* llama_model_loader::init_mapping - replace new llama_mmap to std::make_unique for clean code & reduce (/2) time of running init_mappings

* Update src/llama-vocab.cpp

---------

Co-authored-by: lexasub
Co-authored-by: Diego Devesa
---
 src/llama-model-loader.cpp | 2 +-
 src/llama-vocab.cpp        | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 75073bf61..05d58ad90 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -819,7 +819,7 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
     for (const auto & file : files) {
         auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
         auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
-        std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
+        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
         mmaps_used.emplace_back(mapping->size(), 0);
         if (mlock_mmaps) {
             std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 0782d3a41..561f8bdb8 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1245,8 +1245,13 @@ struct llama_vocab::impl {
     std::vector<llama_token> cache_special_tokens;
     std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
-
-    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+    struct pair_hash {
+        size_t operator()(const std::pair<std::string, std::string> & p) const {
+            return std::hash<std::string>{}(p.first) ^  //create some hash for pair
+                   (std::hash<std::string>{}(p.second) << 1);
+        }
+    };
+    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
 
     // set of all tokens that cause "end of generation"
     std::set<llama_token> special_eog_ids;

From d6d24cd9ed6d0b9558643dcc28f2124bef488c52 Mon Sep 17 00:00:00 2001
From: Haus1
Date: Mon, 27 Jan 2025 08:58:17 -0500
Subject: [PATCH 11/24] AMD: parse the architecture as supplied by gcnArchName (#11244)

The value provided by minor doesn't include stepping for AMD, parse the
value returned by gcnArchName instead to retrieve an accurate ID.
---
 ggml/src/ggml-cuda/common.cuh   | 20 +++++-----
 ggml/src/ggml-cuda/ggml-cuda.cu | 67 ++++++++++++++++++++++++++++++++-
 2 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index bb6120568..a66322da0 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -46,20 +46,20 @@
 #define GGML_CUDA_CC_VOLTA      700
 #define GGML_CUDA_CC_TURING     750
 #define GGML_CUDA_CC_AMPERE     800
-#define GGML_CUDA_CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
 
 // GCN/CNDA, wave size is 64
-#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 803)   // Tonga, Fiji, Polaris, minimum for fast fp16
-#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 900)   // Vega56/64, minimum for fp16 dual issue
-#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 906)   // MI50/Radeon VII, minimum for dp4a
-#define GGML_CUDA_CC_CDNA       (GGML_CUDA_CC_OFFSET_AMD + 908)   // MI100, minimum for MFMA, acc registers
-#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 910)   // MI210, minimum acc register renameing
-#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 942)   // MI300
+#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 0x803)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 0x900)  // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 0x906)  // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA       (GGML_CUDA_CC_OFFSET_AMD + 0x908)  // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910)  // MI210, minimum acc register renameing
+#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x942)  // MI300
 
 // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 1010)  // RX 5000
-#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 1030)  // RX 6000, minimum for dp4a
-#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 1100)  // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
 
 #define GGML_CUDA_CC_QY1        210
 #define GGML_CUDA_CC_QY2        220
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu 
b/ggml/src/ggml-cuda/ggml-cuda.cu index 85178abd2..402f37e85 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -119,6 +119,55 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) #endif } +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +static int ggml_cuda_parse_id(char devName[]) { + // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp + // these values are not stable so this is susceptible to breakage + // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp + int archMajor = 0x0; + int archMinor = 0x0; + int archNum = GGML_CUDA_CC_OFFSET_AMD; + int archLen = strlen(devName); + char archName[archLen + 1]; + + // strip leading 'gfx' while copying into our buffer + if (archLen > 3) { + strcpy(archName, &devName[3]); + archLen -= 3; + } + + // trim trailing :xnack- or :sramecc- statuses + archLen = strcspn(archName, ":"); + archName[archLen] = '\0'; + + // tease out the version information + if (archLen > 8) { + // versions labeled generic use '-' as delimiter + // strip the trailing "-generic" then iterate through what remains + if ((strstr(archName, "-generic"))) { + archName[archLen - 8] = '\0'; + char * pch; + if ((pch = strtok(archName, "-"))) { + archMajor = (int)strtoul(pch, 0, 16); + if ((pch = strtok(NULL, "-"))) { + archMinor = 0x10 * (int)strtoul(pch, 0, 16); + } + } + } + } else if (archLen >= 3) { + // last two digits should be the minor * 0x10 + stepping + archMinor = (int)strtoul(&archName[archLen - 2], 0, 16); + archName[archLen - 2] = '\0'; + + // only the major version remains + archMajor = (int)strtoul(archName, 0, 16); + } + archNum += archMajor * 0x100; + archNum += archMinor; + return archNum; +} +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) + static ggml_cuda_device_info ggml_cuda_init() { #ifdef __HIP_PLATFORM_AMD__ // Workaround for a rocBLAS bug when using multiple graphics cards: @@ -169,7 +218,6 @@ static ggml_cuda_device_info ggml_cuda_init() { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); info.default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; @@ -178,10 +226,25 @@ static ggml_cuda_device_info ggml_cuda_init() { info.devices[id].smpb = prop.sharedMemPerBlock; #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) info.devices[id].smpbo = prop.sharedMemPerBlock; - info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD; + + info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName); + if ((info.devices[id].cc & 0xff00) == 0x0) { + GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n", + id, prop.name, prop.gcnArchName, prop.major, prop.minor); + + // Fallback to prop.major and prop.minor + if (prop.major > 0) { + info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100; + info.devices[id].cc += prop.minor * 0x10; + } + } + GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s\n", + id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no"); #else info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].cc = 100*prop.major + 10*prop.minor; + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", + id, prop.name, prop.major, prop.minor, device_vmm ? 
"yes" : "no"); #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } From a4417ddda98fd0558fb4d802253e68a933704b59 Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Mon, 27 Jan 2025 19:36:10 +0100 Subject: [PATCH 12/24] Add new hf protocol for ollama (#11449) https://huggingface.co/docs/hub/en/ollama Signed-off-by: Eric Curtin --- examples/run/run.cpp | 121 ++++++++++++++++++++++++++++--------------- 1 file changed, 80 insertions(+), 41 deletions(-) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 92a49eb74..8a0db74b6 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -319,6 +319,10 @@ class HttpClient { public: int init(const std::string & url, const std::vector & headers, const std::string & output_file, const bool progress, std::string * response_str = nullptr) { + if (std::filesystem::exists(output_file)) { + return 0; + } + std::string output_file_partial; curl = curl_easy_init(); if (!curl) { @@ -558,13 +562,14 @@ class LlamaData { } sampler = initialize_sampler(opt); + return 0; } private: #ifdef LLAMA_USE_CURL - int download(const std::string & url, const std::vector & headers, const std::string & output_file, - const bool progress, std::string * response_str = nullptr) { + int download(const std::string & url, const std::string & output_file, const bool progress, + const std::vector & headers = {}, std::string * response_str = nullptr) { HttpClient http; if (http.init(url, headers, output_file, progress, response_str)) { return 1; @@ -573,48 +578,85 @@ class LlamaData { return 0; } #else - int download(const std::string &, const std::vector &, const std::string &, const bool, + int download(const std::string &, const std::string &, const bool, const std::vector & = {}, std::string * = nullptr) { printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + return 1; } #endif - int huggingface_dl(const std::string & model, const std::vector headers, const std::string & bn) { - // Find the second occurrence of '/' after protocol string - size_t pos = model.find('/'); - pos = model.find('/', pos + 1); - if (pos == std::string::npos) { - return 1; - } - - const std::string hfr = model.substr(0, pos); - const std::string hff = model.substr(pos + 1); - const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff; - return download(url, headers, bn, true); - } - - int ollama_dl(std::string & model, const std::vector headers, const std::string & bn) { - if (model.find('/') == std::string::npos) { - model = "library/" + model; - } - - std::string model_tag = "latest"; - size_t colon_pos = model.find(':'); + // Helper function to handle model tag extraction and URL construction + std::pair extract_model_and_tag(std::string & model, const std::string & base_url) { + std::string model_tag = "latest"; + const size_t colon_pos = model.find(':'); if (colon_pos != std::string::npos) { model_tag = model.substr(colon_pos + 1); model = model.substr(0, colon_pos); } - std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag; + std::string url = base_url + model + "/manifests/" + model_tag; + + return { model, url }; + } + + // Helper function to download and parse the manifest + int download_and_parse_manifest(const std::string & url, const std::vector & headers, + nlohmann::json & manifest) { std::string manifest_str; - const int ret = download(manifest_url, headers, "", false, &manifest_str); + int ret = download(url, "", false, headers, &manifest_str); if (ret) { 
return ret; } - nlohmann::json manifest = nlohmann::json::parse(manifest_str); - std::string layer; + manifest = nlohmann::json::parse(manifest_str); + + return 0; + } + + int huggingface_dl(std::string & model, const std::string & bn) { + // Find the second occurrence of '/' after protocol string + size_t pos = model.find('/'); + pos = model.find('/', pos + 1); + std::string hfr, hff; + std::vector headers = { "User-Agent: llama-cpp", "Accept: application/json" }; + std::string url; + + if (pos == std::string::npos) { + auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/"); + hfr = model_name; + + nlohmann::json manifest; + int ret = download_and_parse_manifest(manifest_url, headers, manifest); + if (ret) { + return ret; + } + + hff = manifest["ggufFile"]["rfilename"]; + } else { + hfr = model.substr(0, pos); + hff = model.substr(pos + 1); + } + + url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff; + + return download(url, bn, true, headers); + } + + int ollama_dl(std::string & model, const std::string & bn) { + const std::vector headers = { "Accept: application/vnd.docker.distribution.manifest.v2+json" }; + if (model.find('/') == std::string::npos) { + model = "library/" + model; + } + + auto [model_name, manifest_url] = extract_model_and_tag(model, "https://registry.ollama.ai/v2/"); + nlohmann::json manifest; + int ret = download_and_parse_manifest(manifest_url, {}, manifest); + if (ret) { + return ret; + } + + std::string layer; for (const auto & l : manifest["layers"]) { if (l["mediaType"] == "application/vnd.ollama.image.model") { layer = l["digest"]; @@ -622,8 +664,9 @@ class LlamaData { } } - std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer; - return download(blob_url, headers, bn, true); + std::string blob_url = "https://registry.ollama.ai/v2/" + model_name + "/blobs/" + layer; + + return download(blob_url, bn, true, headers); } std::string basename(const std::string & path) { @@ -653,22 +696,18 @@ class LlamaData { return ret; } - const std::string bn = basename(model_); - const std::vector headers = { "--header", - "Accept: application/vnd.docker.distribution.manifest.v2+json" }; + const std::string bn = basename(model_); if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) { rm_until_substring(model_, "://"); - ret = huggingface_dl(model_, headers, bn); + ret = huggingface_dl(model_, bn); } else if (string_starts_with(model_, "hf.co/")) { rm_until_substring(model_, "hf.co/"); - ret = huggingface_dl(model_, headers, bn); - } else if (string_starts_with(model_, "ollama://")) { - rm_until_substring(model_, "://"); - ret = ollama_dl(model_, headers, bn); + ret = huggingface_dl(model_, bn); } else if (string_starts_with(model_, "https://")) { - ret = download(model_, headers, bn, true); - } else { - ret = ollama_dl(model_, headers, bn); + ret = download(model_, bn, true); + } else { // ollama:// or nothing + rm_until_substring(model_, "://"); + ret = ollama_dl(model_, bn); } model_ = bn; From 2b8525d5c89b124c4578a2621cbeb64354ff3d9c Mon Sep 17 00:00:00 2001 From: Michael Engel Date: Tue, 28 Jan 2025 09:32:40 +0100 Subject: [PATCH 13/24] Handle missing model in CLI parameters for llama-run (#11399) The HTTP client in llama-run only prints an error in case the download of a resource failed. If the model name in the CLI parameter list is missing, this causes the application to crash. 
In order to prevent this, a check for the required model parameter has been added and errors for resource downloads get propagated to the caller. Signed-off-by: Michael Engel --- examples/run/run.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 8a0db74b6..5980a786f 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -181,6 +181,10 @@ class Opt { } } + if (model_.empty()){ + return 1; + } + return 0; } @@ -350,7 +354,11 @@ class HttpClient { data.file_size = set_resume_point(output_file_partial); set_progress_options(progress, data); set_headers(headers); - perform(url); + CURLcode res = perform(url); + if (res != CURLE_OK){ + printe("Fetching resource '%s' failed: %s\n", url.c_str(), curl_easy_strerror(res)); + return 1; + } if (!output_file.empty()) { std::filesystem::rename(output_file_partial, output_file); } @@ -415,16 +423,12 @@ class HttpClient { } } - void perform(const std::string & url) { - CURLcode res; + CURLcode perform(const std::string & url) { curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https"); curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L); - res = curl_easy_perform(curl); - if (res != CURLE_OK) { - printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res)); - } + return curl_easy_perform(curl); } static std::string human_readable_time(double seconds) { From 6e84b0ab8e10b8f6f99a32855f976ebcd35b0353 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Tue, 28 Jan 2025 15:26:58 +0530 Subject: [PATCH 14/24] SYCL : SOFTMAX F16 mask support and other fixes (#11261) Implemented ggml_sycl_op_soft_max() F16 src1(mask) support for which a pragma deprecation warning was added during #5021. To do this, had to decouple it from ggml_sycl_op_flatten which always considered src1 to be of fp32 type(many OP functions are dependent on it). 
* SYCL: SOFTMAX F16 mask support and other fixes * test-backend-ops: Add F16 mask test cases --- ggml/src/ggml-sycl/ggml-sycl.cpp | 6 +--- ggml/src/ggml-sycl/softmax.cpp | 56 +++++++++++++++++++------------- ggml/src/ggml-sycl/softmax.hpp | 6 +--- tests/test-backend-ops.cpp | 45 ++++++++++++++++--------- 4 files changed, 64 insertions(+), 49 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index ed4d8bb8b..2984ed82e 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3878,10 +3878,6 @@ static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf); } -static void ggml_sycl_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_soft_max); -} - static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rope); @@ -4090,7 +4086,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_diag_mask_inf(ctx, dst); break; case GGML_OP_SOFT_MAX: - ggml_sycl_soft_max(ctx, dst); + ggml_sycl_op_soft_max(ctx, dst); break; case GGML_OP_ROPE: ggml_sycl_rope(ctx, dst); diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index a9b3fce0d..563e0655f 100644 --- a/ggml/src/ggml-sycl/softmax.cpp +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -1,7 +1,7 @@ -#include "norm.hpp" +#include "softmax.hpp" -template -static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par, +template +static void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) { const int ncols = ncols_template == 0 ? ncols_par : ncols_template; @@ -29,7 +29,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const slope = sycl::pow(base, float(exp)); } - float *vals = vals_smem ? buf + std::max(nwarps, WARP_SIZE) : dst + rowx * ncols; + float *vals = vals_smem ? buf + sycl::max(nwarps, WARP_SIZE) : dst + rowx * ncols; float max_val = -INFINITY; for (int col0 = 0; col0 < ncols; col0 += block_size) { @@ -42,7 +42,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const const int ix = rowx*ncols + col; const int iy = rowy*ncols + col; - const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f); + const float val = x[ix]*scale + (mask ? 
slope*static_cast(mask[iy]) : 0.0f); vals[col] = val; max_val = sycl::max(max_val, val); @@ -65,7 +65,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const item_ct1.barrier(sycl::access::fence_space::local_space); max_val = buf[lane_id]; for (size_t i = 1; i < nreduce; i += 1) { - max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]); + max_val = sycl::max(max_val, buf[lane_id + i * WARP_SIZE]); } max_val = warp_reduce_max(max_val, item_ct1); } @@ -122,8 +122,8 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const } } -template -static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par, +template +static void soft_max_f32_submitter(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims, const size_t n_local_scratch, queue_ptr stream) { @@ -141,7 +141,8 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float * }); } -static void soft_max_f32_sycl(const float * x, const float * mask, +template +static void soft_max_f32_sycl(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, queue_ptr stream, int device) { @@ -223,22 +224,16 @@ static void soft_max_f32_sycl(const float * x, const float * mask, } } -void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { +void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); -#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support") -#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") - GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional + GGML_ASSERT(!dst->src[1] || dst->src[1]->type == GGML_TYPE_F16 || dst->src[1]->type == GGML_TYPE_F32); // src1 contains mask and it is optional - const int64_t ne00 = src0->ne[0]; - const int64_t nrows_x = ggml_nrows(src0); - const int64_t nrows_y = src0->ne[1]; + const int64_t ne00 = dst->src[0]->ne[0]; + const int64_t nrows_x = ggml_nrows(dst->src[0]); + const int64_t nrows_y = dst->src[0]->ne[1]; float scale = 1.0f; float max_bias = 0.0f; @@ -246,6 +241,21 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, const ggml_tensor *s memcpy(&scale, dst->op_params + 0, sizeof(float)); memcpy(&max_bias, dst->op_params + 1, sizeof(float)); - soft_max_f32_sycl(src0_dd, src1 ? 
src1_dd : nullptr, dst_dd, ne00, - nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + + ggml_sycl_set_device(ctx.device); + dpct::queue_ptr main_stream = ctx.stream(); + + if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) { + const sycl::half * src1_dd = static_cast(dst->src[1]->data); + soft_max_f32_sycl(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, + main_stream, ctx.device); + } else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) { + const float * src1_dd = static_cast(dst->src[1]->data); + soft_max_f32_sycl(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device); + } else { + /* mask unavailable */ + soft_max_f32_sycl(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device); + } } diff --git a/ggml/src/ggml-sycl/softmax.hpp b/ggml/src/ggml-sycl/softmax.hpp index bdb8f712e..2cf8582ec 100644 --- a/ggml/src/ggml-sycl/softmax.hpp +++ b/ggml/src/ggml-sycl/softmax.hpp @@ -15,10 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream); +void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, ggml_tensor *dst); #endif // GGML_SYCL_SOFTMAX_HPP diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 468016403..4c5c4dd9c 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2347,11 +2347,12 @@ struct test_soft_max : public test_case { const ggml_type type; const std::array ne; const bool mask; + const ggml_type m_prec; const float scale; const float max_bias; std::string vars() override { - return VARS_TO_STR5(type, ne, mask, scale, max_bias); + return VARS_TO_STR6(type, ne, mask, m_prec, scale, max_bias); } // the 1024 test with bias occasionally fails: @@ -2363,9 +2364,10 @@ struct test_soft_max : public test_case { test_soft_max(ggml_type type = GGML_TYPE_F32, std::array ne = {10, 5, 4, 3}, bool mask = false, + ggml_type m_prec = GGML_TYPE_F32, float scale = 1.0f, float max_bias = 0.0f) - : type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {} + : type(type), ne(ne), mask(mask), m_prec(m_prec), scale(scale), max_bias(max_bias) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); @@ -2374,7 +2376,7 @@ struct test_soft_max : public test_case { ggml_tensor * mask = nullptr; if (this->mask) { - mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]); + mask = ggml_new_tensor_2d(ctx, m_prec, ne[0], ne[1]); ggml_set_name(mask, "mask"); } @@ -4150,17 +4152,28 @@ static std::vector> make_test_cases_eval() { for (float scale : {1.0f, 0.1f}) { for (int64_t ne0 : {16, 1024}) { for (int64_t ne1 : {16, 1024}) { - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias)); + if (mask) { + for (ggml_type m_prec : {GGML_TYPE_F32, GGML_TYPE_F16}) { + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, m_prec, scale, max_bias)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, m_prec, scale, max_bias)); + } + } else { + /* The precision of mask here doesn't matter 
as boolean mask is false */ + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, GGML_TYPE_F32, scale, max_bias)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, GGML_TYPE_F32, scale, max_bias)); + } } } } } } - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, GGML_TYPE_F32, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 8.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 8.0f)); for (float max_bias : {0.0f, 8.0f}) { for (float scale : {1.0f, 0.1f}) { @@ -4296,13 +4309,13 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, 1.0f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, 1.0f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, 1.0f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f)); test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1})); test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); From f643120bad8ab3a753daa64aaac8288ee5800e06 Mon Sep 17 00:00:00 2001 From: Nuno Date: Tue, 28 Jan 2025 11:42:32 +0100 Subject: [PATCH 
15/24] docker: add perplexity and bench commands to full image (#11438) Signed-off-by: rare-magma --- .devops/tools.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.devops/tools.sh b/.devops/tools.sh index 9a86e6ea0..41a6b1e55 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -13,9 +13,13 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then exec ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then exec ./llama-cli "$@" +elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then + exec ./llama-bench "$@" +elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then + exec ./llama-perplexity "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." - for i in `ls $1/$2/ggml-model-f16.bin*`; do + for i in $(ls $1/$2/ggml-model-f16.bin*); do if [ -f "${i/f16/q4_0}" ]; then echo "Skip model quantization, it already exists: ${i/f16/q4_0}" else @@ -30,6 +34,10 @@ else echo "Available commands: " echo " --run (-r): Run a model previously converted into ggml" echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" + echo " --bench (-b): Benchmark the performance of the inference for various parameters." + echo " ex: -m model.gguf" + echo " --perplexity (-p): Measure the perplexity of a model over a given text." + echo " ex: -m model.gguf -f file.txt" echo " --convert (-c): Convert a llama model into ggml" echo " ex: --outtype f16 \"/models/7B/\" " echo " --quantize (-q): Optimize with quantization process ggml" From 4bf3119d61c1de5660025fd5a611effe503e3d2b Mon Sep 17 00:00:00 2001 From: someone13574 <81528246+someone13574@users.noreply.github.com> Date: Tue, 28 Jan 2025 09:15:34 -0500 Subject: [PATCH 16/24] cmake : don't fail on `GGML_CPU=OFF` (#11457) --- ggml/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 8d2b948fb..566709135 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -308,7 +308,7 @@ if (GGML_CPU_ALL_VARIANTS) # MSVC doesn't support AMX ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) endif() -else () +elseif (GGML_CPU) ggml_add_cpu_backend_variant_impl("") endif() From d7d1eccacccaa698c9232014b96a82b359595d6e Mon Sep 17 00:00:00 2001 From: Nuno Date: Tue, 28 Jan 2025 15:17:25 +0100 Subject: [PATCH 17/24] docker: allow installing pip packages system-wide (#11437) Signed-off-by: rare-magma --- .devops/vulkan.Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index b5bd3b6d2..eabf832f8 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -55,8 +55,9 @@ RUN apt-get update \ git \ python3 \ python3-pip \ - && pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt \ + python3-wheel \ + && pip install --break-system-packages --upgrade setuptools \ + && pip install --break-system-packages -r requirements.txt \ && apt autoremove -y \ && apt clean -y \ && rm -rf /tmp/* /var/tmp/* \ From 7fee2889e6565830631fbe76d47ef85cf8fd946a Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Tue, 28 Jan 2025 15:45:41 +0100 Subject: [PATCH 18/24] Add github protocol pulling and http:// (#11465) As pulling protocols to llama-run Signed-off-by: Eric Curtin --- examples/run/run.cpp | 40 +++++++++++++++++++++++++++++++++++++++- 1 file 
changed, 39 insertions(+), 1 deletion(-) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 5980a786f..40f2bcb00 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -673,6 +673,40 @@ class LlamaData { return download(blob_url, bn, true, headers); } + int github_dl(const std::string & model, const std::string & bn) { + std::string repository = model; + std::string branch = "main"; + size_t at_pos = model.find('@'); + if (at_pos != std::string::npos) { + repository = model.substr(0, at_pos); + branch = model.substr(at_pos + 1); + } + + std::vector repo_parts; + size_t start = 0; + for (size_t end = 0; (end = repository.find('/', start)) != std::string::npos; start = end + 1) { + repo_parts.push_back(repository.substr(start, end - start)); + } + + repo_parts.push_back(repository.substr(start)); + if (repo_parts.size() < 3) { + printe("Invalid GitHub repository format\n"); + return 1; + } + + const std::string org = repo_parts[0]; + const std::string project = repo_parts[1]; + std::string project_path = repo_parts[2]; + for (size_t i = 3; i < repo_parts.size(); ++i) { + project_path += "/" + repo_parts[i]; + } + + const std::string url = + "https://raw.githubusercontent.com/" + org + "/" + project + "/" + branch + "/" + project_path; + + return download(url, bn, true); + } + std::string basename(const std::string & path) { const size_t pos = path.find_last_of("/\\"); if (pos == std::string::npos) { @@ -707,8 +741,12 @@ class LlamaData { } else if (string_starts_with(model_, "hf.co/")) { rm_until_substring(model_, "hf.co/"); ret = huggingface_dl(model_, bn); - } else if (string_starts_with(model_, "https://")) { + } else if (string_starts_with(model_, "https://") || string_starts_with(model_, "http://")) { ret = download(model_, bn, true); + } else if (string_starts_with(model_, "github:") || string_starts_with(model_, "github://")) { + rm_until_substring(model_, "github://"); + rm_until_substring(model_, "github:"); + ret = github_dl(model_, bn); } else { // ollama:// or nothing rm_until_substring(model_, "://"); ret = ollama_dl(model_, bn); From cae9fb4361138b937464524eed907328731b81f6 Mon Sep 17 00:00:00 2001 From: Nikita Sarychev <42014488+sARY77@users.noreply.github.com> Date: Tue, 28 Jan 2025 07:42:20 -0800 Subject: [PATCH 19/24] HIP: Only call rocblas_initialize on rocblas versions with the multiple instantation bug (#11080) This disables the workaround on rocblas fixed versions (>=4.0.0) to eliminate the runtime cost and unnecessary VRAM allocation of loading all tensile objects. 
--- ggml/src/ggml-cuda/ggml-cuda.cu | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 402f37e85..de3f9c2ca 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -172,8 +173,25 @@ static ggml_cuda_device_info ggml_cuda_init() { #ifdef __HIP_PLATFORM_AMD__ // Workaround for a rocBLAS bug when using multiple graphics cards: // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 - rocblas_initialize(); - CUDA_CHECK(cudaDeviceSynchronize()); + { + int major_version = 0; + size_t version_length = 0; + if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) { + std::string version(version_length, '\0'); + if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) { + version.resize(::strlen(version.c_str())); + int parsed_value = 0; + if (std::from_chars(version.c_str(), version.c_str() + version.length(), parsed_value).ec == std::errc()) { + major_version = parsed_value; + } + } + } + if (major_version < 4) { + GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n"); + rocblas_initialize(); + CUDA_CHECK(cudaDeviceSynchronize()); + } + } #endif ggml_cuda_device_info info = {}; From be5ef7963fcf14a9c77c963fdd3f7b606eacb498 Mon Sep 17 00:00:00 2001 From: uvos Date: Tue, 28 Jan 2025 23:06:32 +0100 Subject: [PATCH 20/24] HIP: Supress transformation warning in softmax.cu loops with bounds not known at compile time can not be unrolled. when ncols_template == 0, the bounds of the loop are not constexpr, thus llvm cant unroll the loops here. --- ggml/src/ggml-cuda/softmax.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu index 9aa4b8489..da377200e 100644 --- a/ggml/src/ggml-cuda/softmax.cu +++ b/ggml/src/ggml-cuda/softmax.cu @@ -13,6 +13,12 @@ __device__ float __forceinline__ t2f32(half val) { return __half2float(val); } +// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled. +// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here. 
+#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpass-failed" +#endif template static __global__ void soft_max_f32( const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, @@ -118,6 +124,9 @@ static __global__ void soft_max_f32( dst[col] = vals[col] * inv_sum; } } +#ifdef __clang__ +#pragma clang diagnostic pop +#endif static __global__ void soft_max_back_f32( const float * grad, const float * dstf, float * dst, const int ncols, const float scale) { From d0c08040b6c8bebeade7b8d5764df6cf901678d5 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 29 Jan 2025 00:02:56 +0100 Subject: [PATCH 21/24] ci : fix build CPU arm64 (#11472) * ci : fix build CPU arm64 * failed, trying ubuntu 22 * vulkan: ubuntu 24 * vulkan : jammy --> noble --- .devops/vulkan.Dockerfile | 4 ++-- .github/workflows/docker.yml | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index eabf832f8..9064f3838 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -1,4 +1,4 @@ -ARG UBUNTU_VERSION=22.04 +ARG UBUNTU_VERSION=24.04 FROM ubuntu:$UBUNTU_VERSION AS build @@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget # Install Vulkan SDK and cURL RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ - wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \ apt update -y && \ apt-get install -y vulkan-sdk libcurl4-openssl-dev curl diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 6bf22eb66..6955a7dc8 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -28,7 +28,7 @@ jobs: push_to_registry: name: Push Docker image to Docker Hub - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: COMMIT_SHA: ${{ github.sha }} strategy: @@ -36,8 +36,7 @@ jobs: matrix: config: # Multi-stage build - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/arm64", full: true, light: true, server: true, freediskspace: false} + - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false} - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} From cf8cc856d7d02165bd08593b4757e1256a62d501 Mon Sep 17 00:00:00 2001 From: peidaqi Date: Tue, 28 Jan 2025 16:03:42 -0700 Subject: [PATCH 22/24] server : Fixed wrong function name in llamacpp server unit test (#11473) The test_completion_stream_with_openai_library() function is actually with stream=False by default, and test_completion_with_openai_library() with stream=True --- examples/server/tests/unit/test_completion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index c1fc12462..0ed5b99be 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -87,7 +87,7 @@ def test_completion_stream_vs_non_stream(): assert content_stream == res_non_stream.body["content"] -def test_completion_stream_with_openai_library(): +def test_completion_with_openai_library(): global server server.start() client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") @@ -102,7 +102,7 @@ def test_completion_stream_with_openai_library(): assert match_regex("(going|bed)+", res.choices[0].text) -def test_completion_with_openai_library(): +def test_completion_stream_with_openai_library(): global server server.start() client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") From 794fe23f29fb40104975c91fe19f23798f7c726e Mon Sep 17 00:00:00 2001 From: Emreerdog <34742675+Emreerdog@users.noreply.github.com> Date: Wed, 29 Jan 2025 02:22:06 +0300 Subject: [PATCH 23/24] cmake: add hints for locating ggml on Windows using Llama find-package (#11466) --- cmake/llama-config.cmake.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index 40ade96e5..90cbec5b6 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -9,7 +9,7 @@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") -find_package(ggml REQUIRED) +find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake) find_library(llama_LIBRARY llama REQUIRED From 325afb370a1a7b32b5fe46a749bc840c66db9765 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Wed, 29 Jan 2025 12:07:21 +0800 Subject: [PATCH 24/24] llama: fix missing k_cache store for rwkv6qwen2 (#11445) Signed-off-by: Molly Sophia --- src/llama.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 12e8f41fc..192b20a27 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7700,17 +7700,13 @@ struct llm_build_context { 1 ); + struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); ggml_build_forward_expand( gf, ggml_cpy( ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) + ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0), + ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) ) );