diff --git a/.gitignore b/.gitignore
index b84459b92..62b6b8b1a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,11 +23,13 @@
 .clang-tidy
 .vs/
 .vscode/
+.idea/
 
 lcov-report/
 gcovr-report/
 
 build*
+cmake-build-*
 out/
 tmp/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f8c7f9978..2a922fdb3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,17 +112,14 @@ option(LLAMA_MPI "llama: use MPI"
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
+option(LLAMA_CPU_HBM                         "llama: use memkind for CPU HBM"                   OFF)
 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
-
 # add perf arguments
 option(LLAMA_PERF                            "llama: enable perf"                               OFF)
-if (LLAMA_PERF)
-    add_definitions(-DGGML_PERF)
-endif()
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@@ -130,6 +127,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 #
 # Compile flags
 #
+
 if (LLAMA_SYCL)
     set(CMAKE_CXX_STANDARD 17)
 else()
@@ -140,6 +138,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
+
 find_package(Threads REQUIRED)
 
 include(CheckCXXCompilerFlag)
@@ -151,17 +150,17 @@ endif()
 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)
         add_compile_options(-fsanitize=thread)
-        link_libraries(-fsanitize=thread)
+        link_libraries (-fsanitize=thread)
     endif()
 
     if (LLAMA_SANITIZE_ADDRESS)
         add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-        link_libraries(-fsanitize=address)
+        link_libraries (-fsanitize=address)
     endif()
 
     if (LLAMA_SANITIZE_UNDEFINED)
         add_compile_options(-fsanitize=undefined)
-        link_libraries(-fsanitize=undefined)
+        link_libraries (-fsanitize=undefined)
    endif()
 endif()
@@ -298,14 +297,17 @@ if (LLAMA_BLAS)
         endif()
 
         message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
         add_compile_options(${BLAS_LINKER_FLAGS})
+
         add_compile_definitions(GGML_USE_OPENBLAS)
+
         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
+
+        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
     else()
         message(WARNING "BLAS not found, please refer to "
                         "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@@ -330,9 +332,6 @@ if (LLAMA_CUBLAS)
         set(GGML_SOURCES_CUDA ggml-cuda.cu)
 
         add_compile_definitions(GGML_USE_CUBLAS)
-#        if (LLAMA_CUDA_CUBLAS)
-#            add_compile_definitions(GGML_CUDA_CUBLAS)
-#        endif()
         if (LLAMA_CUDA_FORCE_DMMV)
             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
         endif()
@@ -387,15 +386,20 @@ if (LLAMA_MPI)
     find_package(MPI)
     if (MPI_C_FOUND)
         message(STATUS "MPI found")
+
         set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        set(GGML_SOURCES_MPI ggml-mpi.c)
+
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
+
         if (NOT MSVC)
             add_compile_options(-Wno-cast-qual)
         endif()
+
         set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${MPI_C_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
+
         # Even if you're only using the C header, C++ programs may bring in MPI
         # C++ functions, so more linkage is needed
         if (MPI_CXX_FOUND)
@@ -427,31 +431,28 @@ if (LLAMA_VULKAN)
     if (Vulkan_FOUND)
         message(STATUS "Vulkan found")
 
-        add_library(ggml-vulkan OBJECT ggml-vulkan.cpp ggml-vulkan.h)
-        if (BUILD_SHARED_LIBS)
-            set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        endif()
-        target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
+        set(GGML_HEADERS_VULKAN ggml-vulkan.h)
+        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
 
         add_compile_definitions(GGML_USE_VULKAN)
 
         if (LLAMA_VULKAN_CHECK_RESULTS)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_CHECK_RESULTS)
+            add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
         endif()
 
         if (LLAMA_VULKAN_DEBUG)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_DEBUG)
+            add_compile_definitions(GGML_VULKAN_DEBUG)
         endif()
 
         if (LLAMA_VULKAN_VALIDATE)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_VALIDATE)
+            add_compile_definitions(GGML_VULKAN_VALIDATE)
         endif()
 
         if (LLAMA_VULKAN_RUN_TESTS)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_RUN_TESTS)
+            add_compile_definitions(GGML_VULKAN_RUN_TESTS)
         endif()
 
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan)
     else()
         message(WARNING "Vulkan not found")
     endif()
@@ -463,43 +464,45 @@ if (LLAMA_HIPBLAS)
     if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
         message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
     endif()
+
     if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
         message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
     endif()
 
-    find_package(hip)
-    find_package(hipblas)
-    find_package(rocblas)
+    find_package(hip     REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(rocblas REQUIRED)
 
-    if (${hipblas_FOUND} AND ${hip_FOUND})
-        message(STATUS "HIP and hipBLAS found")
-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-        if (LLAMA_HIP_UMA)
-            add_compile_definitions(GGML_HIP_UMA)
-        endif()
-        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
-        if (BUILD_SHARED_LIBS)
-            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        endif()
-        if (LLAMA_CUDA_FORCE_DMMV)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
-        endif()
-        if (LLAMA_CUDA_FORCE_MMQ)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
-        endif()
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
-        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+    message(STATUS "HIP and hipBLAS found")
 
-        if (LLAMA_STATIC)
-            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-        endif()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
-    else()
-        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
+
+    set(GGML_HEADERS_ROCM ggml-cuda.h)
+    set(GGML_SOURCES_ROCM ggml-cuda.cu)
+
+    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+
+    if (LLAMA_HIP_UMA)
+        add_compile_definitions(GGML_HIP_UMA)
     endif()
+
+    if (LLAMA_CUDA_FORCE_DMMV)
+        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
+    endif()
+
+    if (LLAMA_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
+    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+
+    set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+
+    if (LLAMA_STATIC)
+        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+    endif()
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()
@@ -509,10 +512,14 @@ if (LLAMA_SYCL)
     #todo: AOT
 
     find_package(IntelSYCL REQUIRED)
+
+    message(STATUS "SYCL found")
+
+    add_compile_definitions(GGML_USE_SYCL)
+
     if (LLAMA_SYCL_F16)
         add_compile_definitions(GGML_SYCL_F16)
     endif()
-    add_compile_definitions(GGML_USE_SYCL)
 
     add_compile_options(-I./) #include DPCT
     add_compile_options(-I/${SYCL_INCLUDE_DIR})
@@ -521,7 +528,7 @@ if (LLAMA_SYCL)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
 
-    set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
+    set(GGML_HEADERS_SYCL ggml-sycl.h)
     set(GGML_SOURCES_SYCL ggml-sycl.cpp)
 
     if (WIN32)
@@ -540,61 +547,61 @@ if (LLAMA_KOMPUTE)
     endif()
 
     function(compile_shader)
-      set(options)
-      set(oneValueArgs)
-      set(multiValueArgs SOURCES)
-      cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-      foreach(source ${compile_shader_SOURCES})
-        get_filename_component(filename ${source} NAME)
-        set(spv_file ${filename}.spv)
-        add_custom_command(
-          OUTPUT ${spv_file}
-          DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
-            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
-            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
-            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
-          COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-          COMMENT "Compiling ${source} to ${spv_file}"
-          )
-
-        get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
-        set(FILE_NAME "shader${RAW_FILE_NAME}")
-        string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
-        string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
"_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") - set(OUTPUT_HEADER_FILE "${HEADER_FILE}") - message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") - if(CMAKE_GENERATOR MATCHES "Visual Studio") - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" - ) - else() - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" - ) - endif() - endforeach() + get_filename_component(RAW_FILE_NAME ${spv_file} NAME) + set(FILE_NAME "shader${RAW_FILE_NAME}") + string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) + string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) + string(REPLACE "." 
"_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") + set(OUTPUT_HEADER_FILE "${HEADER_FILE}") + message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") + if(CMAKE_GENERATOR MATCHES "Visual Studio") + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" + ) + else() + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" + ) + endif() + endforeach() endfunction() if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") @@ -604,66 +611,66 @@ if (LLAMA_KOMPUTE) # Compile our shaders compile_shader(SOURCES - kompute-shaders/op_scale.comp - kompute-shaders/op_scale_8.comp - kompute-shaders/op_add.comp - kompute-shaders/op_addrow.comp - kompute-shaders/op_mul.comp - kompute-shaders/op_silu.comp - kompute-shaders/op_relu.comp - kompute-shaders/op_gelu.comp - kompute-shaders/op_softmax.comp - kompute-shaders/op_norm.comp - kompute-shaders/op_rmsnorm.comp - kompute-shaders/op_diagmask.comp - kompute-shaders/op_mul_mat_mat_f32.comp - kompute-shaders/op_mul_mat_f16.comp - kompute-shaders/op_mul_mat_q8_0.comp - kompute-shaders/op_mul_mat_q4_0.comp - kompute-shaders/op_mul_mat_q4_1.comp - kompute-shaders/op_mul_mat_q6_k.comp - kompute-shaders/op_getrows_f16.comp - kompute-shaders/op_getrows_q4_0.comp - kompute-shaders/op_getrows_q4_1.comp - kompute-shaders/op_getrows_q6_k.comp - kompute-shaders/op_rope_f16.comp - kompute-shaders/op_rope_f32.comp - kompute-shaders/op_cpy_f16_f16.comp - kompute-shaders/op_cpy_f16_f32.comp - kompute-shaders/op_cpy_f32_f16.comp - kompute-shaders/op_cpy_f32_f32.comp + kompute-shaders/op_scale.comp + kompute-shaders/op_scale_8.comp + kompute-shaders/op_add.comp + kompute-shaders/op_addrow.comp + kompute-shaders/op_mul.comp + kompute-shaders/op_silu.comp + kompute-shaders/op_relu.comp + kompute-shaders/op_gelu.comp + kompute-shaders/op_softmax.comp + kompute-shaders/op_norm.comp + kompute-shaders/op_rmsnorm.comp + 
+            kompute-shaders/op_diagmask.comp
+            kompute-shaders/op_mul_mat_mat_f32.comp
+            kompute-shaders/op_mul_mat_f16.comp
+            kompute-shaders/op_mul_mat_q8_0.comp
+            kompute-shaders/op_mul_mat_q4_0.comp
+            kompute-shaders/op_mul_mat_q4_1.comp
+            kompute-shaders/op_mul_mat_q6_k.comp
+            kompute-shaders/op_getrows_f16.comp
+            kompute-shaders/op_getrows_q4_0.comp
+            kompute-shaders/op_getrows_q4_1.comp
+            kompute-shaders/op_getrows_q6_k.comp
+            kompute-shaders/op_rope_f16.comp
+            kompute-shaders/op_rope_f32.comp
+            kompute-shaders/op_cpy_f16_f16.comp
+            kompute-shaders/op_cpy_f16_f32.comp
+            kompute-shaders/op_cpy_f32_f16.comp
+            kompute-shaders/op_cpy_f32_f32.comp
         )
 
         # Create a custom target for our generated shaders
         add_custom_target(generated_shaders DEPENDS
-        shaderop_scale.h
-        shaderop_scale_8.h
-        shaderop_add.h
-        shaderop_addrow.h
-        shaderop_mul.h
-        shaderop_silu.h
-        shaderop_relu.h
-        shaderop_gelu.h
-        shaderop_softmax.h
-        shaderop_norm.h
-        shaderop_rmsnorm.h
-        shaderop_diagmask.h
-        shaderop_mul_mat_mat_f32.h
-        shaderop_mul_mat_f16.h
-        shaderop_mul_mat_q8_0.h
-        shaderop_mul_mat_q4_0.h
-        shaderop_mul_mat_q4_1.h
-        shaderop_mul_mat_q6_k.h
-        shaderop_getrows_f16.h
-        shaderop_getrows_q4_0.h
-        shaderop_getrows_q4_1.h
-        shaderop_getrows_q6_k.h
-        shaderop_rope_f16.h
-        shaderop_rope_f32.h
-        shaderop_cpy_f16_f16.h
-        shaderop_cpy_f16_f32.h
-        shaderop_cpy_f32_f16.h
-        shaderop_cpy_f32_f32.h
+            shaderop_scale.h
+            shaderop_scale_8.h
+            shaderop_add.h
+            shaderop_addrow.h
+            shaderop_mul.h
+            shaderop_silu.h
+            shaderop_relu.h
+            shaderop_gelu.h
+            shaderop_softmax.h
+            shaderop_norm.h
+            shaderop_rmsnorm.h
+            shaderop_diagmask.h
+            shaderop_mul_mat_mat_f32.h
+            shaderop_mul_mat_f16.h
+            shaderop_mul_mat_q8_0.h
+            shaderop_mul_mat_q4_0.h
+            shaderop_mul_mat_q4_1.h
+            shaderop_mul_mat_q6_k.h
+            shaderop_getrows_f16.h
+            shaderop_getrows_q4_0.h
+            shaderop_getrows_q4_1.h
+            shaderop_getrows_q6_k.h
+            shaderop_rope_f16.h
+            shaderop_rope_f32.h
+            shaderop_cpy_f16_f16.h
+            shaderop_cpy_f16_f32.h
+            shaderop_cpy_f32_f16.h
+            shaderop_cpy_f32_f32.h
         )
 
         # Create a custom command that depends on the generated_shaders
@@ -676,8 +683,10 @@ if (LLAMA_KOMPUTE)
 
         # Add the stamp to the main sources to ensure dependency tracking
         set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
-        set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+        set(GGML_HEADERS_KOMPUTE ggml-kompute.h   ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+
         add_compile_definitions(GGML_USE_KOMPUTE)
+
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
     else()
@@ -685,6 +694,18 @@ if (LLAMA_KOMPUTE)
     endif()
 endif()
 
+if (LLAMA_CPU_HBM)
+    find_library(memkind memkind REQUIRED)
+
+    add_compile_definitions(GGML_USE_CPU_HBM)
+
+    target_link_libraries(ggml PUBLIC memkind)
+endif()
+
+if (LLAMA_PERF)
+    add_compile_definitions(GGML_PERF)
+endif()
+
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
@@ -821,6 +842,7 @@ execute_process(
         ERROR_VARIABLE output
         OUTPUT_QUIET
     )
+
     if (output MATCHES "dyld-1015\.7")
         add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
     endif()
@@ -830,10 +852,10 @@ endif()
 # feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
 if (MSVC)
-    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
-    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
 else ()
-    set(CMAKE_GENERATOR_PLATFORM_LWR "")
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")
 endif ()
 
 if (NOT MSVC)
@@ -1027,11 +1049,6 @@ endif()
 
 # ggml
 
-if (GGML_USE_CPU_HBM)
-    add_definitions(-DGGML_USE_CPU_HBM)
-    find_library(memkind memkind REQUIRED)
-endif()
-
 add_library(ggml OBJECT
             ggml.c
             ggml.h
@@ -1048,16 +1065,17 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
            ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
+            ${GGML_SOURCES_VULKAN}  ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_ROCM}    ${GGML_HEADERS_ROCM}
             )
 
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
-target_compile_features(ggml PUBLIC c_std_11) # don't bump
+target_compile_features (ggml PUBLIC c_std_11) # don't bump
+
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
-if (GGML_USE_CPU_HBM)
-    target_link_libraries(ggml PUBLIC memkind)
-endif()
 
 add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
+
 if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
     add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
@@ -1073,7 +1091,8 @@ add_library(llama
             )
 
 target_include_directories(llama PUBLIC .)
-target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+target_compile_features (llama PUBLIC cxx_std_11) # don't bump
+
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}
@@ -1124,7 +1143,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
 
 set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
-    "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
+    "${GGML_HEADERS_CUDA}"  "${GGML_HEADERS_OPENCL}"
     "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
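
Note: the new `LLAMA_CPU_HBM` option only wires up the `GGML_USE_CPU_HBM` define and the memkind link dependency; the allocation behavior itself lives inside ggml. As a rough illustration of what that define enables (a sketch, not code from this diff; `buffer_alloc` is a hypothetical helper, and the only assumption is memkind's standard hbwmalloc API):

```cpp
// Sketch: allocate a buffer from high-bandwidth memory via memkind's
// hbwmalloc API, falling back to the normal heap when no HBM node exists.
#include <hbwmalloc.h>
#include <cstdio>
#include <cstdlib>

static void * buffer_alloc(size_t size) {
    void * ptr = nullptr;
    // hbw_posix_memalign returns non-zero when the request cannot be
    // satisfied from high-bandwidth memory (e.g. no HBM on this machine)
    if (hbw_posix_memalign(&ptr, 64, size) != 0) {
        fprintf(stderr, "warning: HBM allocation failed, using malloc\n");
        ptr = malloc(size);
    }
    return ptr;
}
```
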
diff --git a/common/common.cpp b/common/common.cpp
index c5e83cc2a..3a92d3797 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -341,7 +341,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             const auto sampler_names = string_split(argv[i], ';');
-            sparams.samplers_sequence = sampler_types_from_names(sampler_names);
+            sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
         } else if (arg == "--sampling-seq") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -964,7 +964,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str());
+    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\'\n");
+    printf("                        (default: %s)\n", sampler_type_names.c_str());
     printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
@@ -1133,34 +1134,50 @@ std::vector<std::string> string_split(std::string input, char separator) {
     return parts;
 }
 
-std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names) {
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+        {"top_k",       llama_sampler_type::TOP_K},
+        {"top_p",       llama_sampler_type::TOP_P},
+        {"typical_p",   llama_sampler_type::TYPICAL_P},
+        {"min_p",       llama_sampler_type::MIN_P},
+        {"tfs_z",       llama_sampler_type::TFS_Z},
+        {"temperature", llama_sampler_type::TEMPERATURE}
+    };
+
     // since samplers names are written multiple ways
     // make it ready for both system names and input names
-    std::unordered_map<std::string, llama_sampler_type> sampler_name_map {
-        {"top_k",       llama_sampler_type::TOP_K},
+    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
         {"top-k",       llama_sampler_type::TOP_K},
-        {"top_p",       llama_sampler_type::TOP_P},
         {"top-p",       llama_sampler_type::TOP_P},
         {"nucleus",     llama_sampler_type::TOP_P},
-        {"typical_p",   llama_sampler_type::TYPICAL_P},
         {"typical-p",   llama_sampler_type::TYPICAL_P},
         {"typical",     llama_sampler_type::TYPICAL_P},
-        {"min_p",       llama_sampler_type::MIN_P},
         {"min-p",       llama_sampler_type::MIN_P},
-        {"tfs_z",       llama_sampler_type::TFS_Z},
         {"tfs-z",       llama_sampler_type::TFS_Z},
         {"tfs",         llama_sampler_type::TFS_Z},
-        {"temp",        llama_sampler_type::TEMP},
-        {"temperature", llama_sampler_type::TEMP}
+        {"temp",        llama_sampler_type::TEMPERATURE}
     };
 
     std::vector<llama_sampler_type> sampler_types;
     sampler_types.reserve(names.size());
-    for (const auto& name : names) {
-        const auto sampler_item = sampler_name_map.find(name);
-        if (sampler_item != sampler_name_map.end()) {
+    for (const auto & name : names)
+    {
+        auto sampler_item = sampler_canonical_name_map.find(name);
+        if (sampler_item != sampler_canonical_name_map.end())
+        {
             sampler_types.push_back(sampler_item->second);
         }
+        else
+        {
+            if (allow_alt_names)
+            {
+                sampler_item = sampler_alt_name_map.find(name);
+                if (sampler_item != sampler_alt_name_map.end())
+                {
+                    sampler_types.push_back(sampler_item->second);
+                }
+            }
+        }
     }
     return sampler_types;
 }
@@ -1172,7 +1189,7 @@ std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & nam
         {'y', llama_sampler_type::TYPICAL_P},
         {'m', llama_sampler_type::MIN_P},
         {'f', llama_sampler_type::TFS_Z},
-        {'t', llama_sampler_type::TEMP}
+        {'t', llama_sampler_type::TEMPERATURE}
     };
 
     std::vector<llama_sampler_type> sampler_types;
@@ -1188,12 +1205,12 @@ std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & nam
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
     switch (sampler_type) {
-        case llama_sampler_type::TOP_K: return "top_k";
-        case llama_sampler_type::TFS_Z: return "tfs_z";
-        case llama_sampler_type::TYPICAL_P: return "typical_p";
-        case llama_sampler_type::TOP_P: return "top_p";
-        case llama_sampler_type::MIN_P: return "min_p";
-        case llama_sampler_type::TEMP: return "temp";
+        case llama_sampler_type::TOP_K:       return "top_k";
+        case llama_sampler_type::TFS_Z:       return "tfs_z";
+        case llama_sampler_type::TYPICAL_P:   return "typical_p";
+        case llama_sampler_type::TOP_P:       return "top_p";
+        case llama_sampler_type::MIN_P:       return "min_p";
+        case llama_sampler_type::TEMPERATURE: return "temperature";
         default : return "";
     }
 }
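
To make the new `allow_alt_names` flag concrete: alternate spellings in `sampler_alt_name_map` are only consulted when the flag is true. A minimal usage sketch (hypothetical caller, not part of the diff):

```cpp
#include "common.h"

// Demonstrates the two modes of sampler_types_from_names.
static std::vector<llama_sampler_type> parse_example() {
    std::vector<std::string> names = { "top-k", "nucleus", "temp" };
    // allow_alt_names = true (CLI behavior): resolves to TOP_K, TOP_P, TEMPERATURE
    // allow_alt_names = false (server behavior): none of these alternate
    // spellings match the canonical map, so all three are silently dropped
    return sampler_types_from_names(names, /*allow_alt_names=*/true);
}
```
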
diff --git a/common/common.h b/common/common.h
index 74c136995..935771d44 100644
--- a/common/common.h
+++ b/common/common.h
@@ -165,7 +165,7 @@ void process_escapes(std::string& input);
 // String utils
 //
 
-std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names);
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string>        string_split(std::string input, char separator);
 std::string                     sampler_type_to_name_string(llama_sampler_type sampler_type);
diff --git a/common/sampling.cpp b/common/sampling.cpp
index a001750da..53013138a 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -139,7 +139,7 @@ static void sampler_queue(
             case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
             case llama_sampler_type::TOP_P    : llama_sample_top_p   (ctx_main, &cur_p, top_p,     min_keep); break;
             case llama_sampler_type::MIN_P    : llama_sample_min_p   (ctx_main, &cur_p, min_p,     min_keep); break;
-            case llama_sampler_type::TEMP:
+            case llama_sampler_type::TEMPERATURE:
                 if (dynatemp_range > 0) {
                     float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
                     float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
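
For reference, the dynamic-temperature branch above derives its bounds symmetrically around `temp`. A worked example with illustrative values (not from the diff):

```cpp
#include <algorithm>

// With temp = 0.8 and dynatemp_range = 0.3 (illustrative values), the
// effective sampling temperature is drawn from [0.5, 1.1]; the lower
// bound is clamped at zero by the std::max calls.
static const float temp           = 0.8f;
static const float dynatemp_range = 0.3f;
static const float dynatemp_min   = std::max(0.0f, temp - dynatemp_range); // 0.5
static const float dynatemp_max   = std::max(0.0f, temp + dynatemp_range); // 1.1
```
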
diff --git a/common/sampling.h b/common/sampling.h
index 2bd6a75d2..e1279a894 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -10,12 +10,12 @@
 
 // sampler types
 enum class llama_sampler_type : char {
-    TOP_K     = 'k',
-    TOP_P     = 'p',
-    MIN_P     = 'm',
-    TFS_Z     = 'f',
-    TYPICAL_P = 'y',
-    TEMP      = 't'
+    TOP_K       = 'k',
+    TOP_P       = 'p',
+    MIN_P       = 'm',
+    TFS_Z       = 'f',
+    TYPICAL_P   = 'y',
+    TEMPERATURE = 't'
 };
 
 // sampling parameters
@@ -45,7 +45,7 @@ typedef struct llama_sampling_params {
         llama_sampler_type::TYPICAL_P,
         llama_sampler_type::TOP_P,
         llama_sampler_type::MIN_P,
-        llama_sampler_type::TEMP
+        llama_sampler_type::TEMPERATURE
     };
 
     std::string grammar;  // optional BNF-like grammar to constrain sampling
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 4ed310a0e..4cb65a07b 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -315,7 +315,6 @@ static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_thre
     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
     if (!image_embd) {
         fprintf(stderr, "Unable to allocate memory for image embeddings\n");
-        free(image_embd);
         return false;
     }
diff --git a/examples/server/README.md b/examples/server/README.md
index 8e141d22d..249368749 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -204,6 +204,8 @@ node index.js
 
     `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 
+    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values)
+
 ### Result JSON
 
 - Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
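
To make the new README field concrete, a hypothetical client could override the sampler order per request like this (sketch using nlohmann::json, the JSON library the server itself uses; prompt and values are illustrative):

```cpp
#include <nlohmann/json.hpp>

// Builds an illustrative /completion request body. Canonical sampler names
// are required here, since the server parses this field with
// sampler_types_from_names(..., /*allow_alt_names=*/false).
static nlohmann::json make_completion_request() {
    return {
        {"prompt",    "Once upon a time"},
        {"n_predict", 64},
        {"samplers",  {"top_k", "min_p", "temperature"}},
    };
}
```
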
diff --git a/examples/server/server.cpp
index 912c750cc..a0b46970b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -436,10 +436,6 @@ struct llama_server_context
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
-
-        // empty system prompt
-        system_prompt = "";
-        system_tokens.clear();
     }
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@@ -676,6 +672,24 @@ struct llama_server_context
             }
         }
 
+        const auto &samplers_sequence = data.find("samplers");
+        if (samplers_sequence != data.end() && samplers_sequence->is_array())
+        {
+            std::vector<std::string> sampler_names;
+            for (const auto &sampler_name : *samplers_sequence)
+            {
+                if (sampler_name.is_string())
+                {
+                    sampler_names.emplace_back(sampler_name);
+                }
+            }
+            slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+        }
+        else
+        {
+            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+        }
+
         if (multimodal)
         {
             const auto &images_data = data.find("image_data");
@@ -765,27 +779,30 @@ struct llama_server_context
     }
 
     void update_system_prompt() {
-        system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
-
-        llama_batch_clear(batch);
-
         kv_cache_clear();
+        system_tokens.clear();
 
-        for (int i = 0; i < (int) system_tokens.size(); ++i)
-        {
-            llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-        }
+        if (!system_prompt.empty()) {
+            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
 
-        if (llama_decode(ctx, batch) != 0)
-        {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return;
-        }
+            llama_batch_clear(batch);
 
-        // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < params.n_parallel; ++i)
-        {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            for (int i = 0; i < (int)system_tokens.size(); ++i)
+            {
+                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+            }
+
+            if (llama_decode(ctx, batch) != 0)
+            {
+                LOG_TEE("%s: llama_decode() failed\n", __func__);
+                return;
+            }
+
+            // assign the system KV cache to all parallel sequences
+            for (int32_t i = 1; i < params.n_parallel; ++i)
+            {
+                llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            }
         }
 
         LOG_TEE("system prompt updated\n");
@@ -807,10 +824,8 @@ struct llama_server_context
         name_user      = sys_props.value("anti_prompt", "");
         name_assistant = sys_props.value("assistant_name", "");
 
-        if (slots.size() > 0)
-        {
-            notify_system_prompt_changed();
-        }
+
+        notify_system_prompt_changed();
     }
 
     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -1029,6 +1044,12 @@ struct llama_server_context
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
 
+        std::vector<std::string> samplers_sequence;
+        for (const auto &sampler_type : slot.sparams.samplers_sequence)
+        {
+            samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+        }
+
         return json {
             {"n_ctx",             slot.n_ctx},
             {"model",             params.model_alias},
@@ -1059,6 +1080,7 @@ struct llama_server_context
             {"logit_bias",        slot.sparams.logit_bias},
             {"n_probs",           slot.sparams.n_probs},
             {"grammar",           slot.sparams.grammar},
+            {"samplers",          samplers_sequence}
         };
     }
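
The ggml-vulkan.cpp hunks below mostly mark file-local helpers `static`, giving them internal linkage so they no longer leak into the global symbol space; only the entry points declared in ggml-vulkan.h remain externally visible. A minimal sketch of the distinction (hypothetical names, not from the diff):

```cpp
// internal linkage: visible only inside this translation unit
static void init_instance_once();

// external linkage: declared in a public header, part of the library surface
void example_backend_init();
```
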
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 1fad24fd1..4a30414df 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1091,7 +1091,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     }
 }
 
-void ggml_vk_instance_init() {
+static void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
     }
@@ -1150,7 +1150,7 @@ void ggml_vk_instance_init() {
     vk_instance_initialized = true;
 }
 
-void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
+static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
 #ifdef GGML_VULKAN_DEBUG
@@ -4556,13 +4556,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     }
 }
 
-GGML_CALL int ggml_vk_get_device_count() {
+GGML_CALL static int ggml_vk_get_device_count() {
     ggml_vk_instance_init();
 
     return vk_instance.device_indices.size();
 }
 
-GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
     ggml_vk_instance_init();
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -4580,7 +4580,7 @@ void ggml_vk_init_cpu_assist() {
 
     std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
 
-    for (size_t i = 0; i < ggml_vk_get_device_count(); i++) {
+    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
         ggml_vk_print_gpu_info(i);
     }
     // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
@@ -5267,7 +5267,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
 }
 
 GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-    GGML_ASSERT(device < vk_instance.device_indices.size());
+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
 
     vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
diff --git a/llama.cpp b/llama.cpp
index dee685c19..199669517 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10893,7 +10893,7 @@ static int llama_apply_lora_from_file_internal(
                 {
                     LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
                                     __func__, ftype);
-                    return false;
+                    return 1;
                 }
             }
diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh
new file mode 100755
index 000000000..331c4b9ce
--- /dev/null
+++ b/scripts/compare-commits.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+if [ $# -lt 2 ]; then
+    echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [additional llama-bench arguments]"
+    exit 1
+fi
+
+set -e
+set -x
+
+bench_args="${@:3}"
+
+rm -f llama-bench.sqlite
+
+backend="cpu"
+
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    backend="metal"
+elif command -v nvcc &> /dev/null; then
+    backend="cuda"
+fi
+
+make_opts=""
+
+if [[ "$backend" == "cuda" ]]; then
+    make_opts="LLAMA_CUBLAS=1"
+fi
+
+git checkout $1
+make clean && make -j32 $make_opts llama-bench
+./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite
+
+git checkout $2
+make clean && make -j32 $make_opts llama-bench
+./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite
+
+./scripts/compare-llama-bench.py -b $1 -c $2
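
Usage note for the new script: a hypothetical invocation would be `./scripts/compare-commits.sh master my-branch -m models/7b/ggml-model-q4_0.gguf`, where everything after the two commits is forwarded verbatim to llama-bench through `bench_args`. Both runs append their results to the shared `llama-bench.sqlite` database, and `scripts/compare-llama-bench.py` prints the final comparison between the two commits.
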