Improve cuBLAS performance by dequantizing on the GPU (#1065)
This commit is contained in:
parent
834695fe3a
commit
02d6988121
5 changed files with 221 additions and 41 deletions
|
@ -110,6 +110,7 @@ if (APPLE AND LLAMA_ACCELERATE)
|
|||
message(WARNING "Accelerate framework not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_OPENBLAS)
|
||||
if (LLAMA_STATIC)
|
||||
set(BLA_STATIC ON)
|
||||
|
@ -150,6 +151,10 @@ if (LLAMA_CUBLAS)
|
|||
if (CUDAToolkit_FOUND)
|
||||
message(STATUS "cuBLAS found")
|
||||
|
||||
enable_language(CUDA)
|
||||
|
||||
set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
|
||||
|
||||
add_compile_definitions(GGML_USE_CUBLAS)
|
||||
|
||||
if (LLAMA_STATIC)
|
||||
|
@ -241,21 +246,26 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
|
|||
message(STATUS "x86 detected")
|
||||
if (MSVC)
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_options(/arch:AVX512)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
|
||||
# MSVC has no compile-time flags enabling specific
|
||||
# AVX512 extensions, neither it defines the
|
||||
# macros corresponding to the extensions.
|
||||
# Do it manually.
|
||||
if (LLAMA_AVX512_VBMI)
|
||||
add_compile_definitions(__AVX512VBMI__)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VNNI)
|
||||
add_compile_definitions(__AVX512VNNI__)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
|
||||
endif()
|
||||
elseif (LLAMA_AVX2)
|
||||
add_compile_options(/arch:AVX2)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
|
||||
elseif (LLAMA_AVX)
|
||||
add_compile_options(/arch:AVX)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
|
||||
endif()
|
||||
else()
|
||||
if (LLAMA_F16C)
|
||||
|
@ -292,7 +302,8 @@ endif()
|
|||
|
||||
add_library(ggml OBJECT
|
||||
ggml.c
|
||||
ggml.h)
|
||||
ggml.h
|
||||
${GGML_CUDA_SOURCES})
|
||||
|
||||
target_include_directories(ggml PUBLIC .)
|
||||
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
||||
|
@ -314,6 +325,14 @@ if (BUILD_SHARED_LIBS)
|
|||
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_SOURCES)
|
||||
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
|
||||
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
|
||||
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
|
||||
set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
|
||||
endif()
|
||||
|
||||
|
||||
#
|
||||
# programs, examples and tests
|
||||
#
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue