Make CC_TURING overridable at build time.

ggml-cuda.cu now defines CC_TURING as 700 only when the build has not already
defined it. The HIP/ROCm rules in CMakeLists.txt and the Makefile pass
CC_TURING=1000000000, the value the source previously hard-coded.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 26cd0a494..13d862c4b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -378,6 +378,7 @@ if (LLAMA_HIPBLAS)
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
         set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
         target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
 
diff --git a/Makefile b/Makefile
index 0843d86c6..59b4abeba 100644
--- a/Makefile
+++ b/Makefile
@@ -296,6 +296,7 @@ ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
 ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+ggml-cuda.o: CXXFLAGS += -DCC_TURING=1000000000
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(CXX) $(CXXFLAGS) -x hip -c -o $@ $<
 endif # LLAMA_HIPBLAS
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 03ecdee7c..aa4a2e919 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -74,7 +74,9 @@
 #include "ggml.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_TURING 1000000000
+#ifndef CC_TURING
+#define CC_TURING 700
+#endif
 
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300