Cleaner CUDA/host flags separation

Also renamed GGML_ASSUME to GGML_CUDA_ASSUME
Vlad 2023-09-14 02:11:48 +03:00
parent f06ddb2e79
commit e6547afea1
2 changed files with 70 additions and 65 deletions

Makefile

@@ -95,16 +95,19 @@ CXXV := $(shell $(CXX) --version | head -n 1)
 #
 # keep standard at C11 and C++11
+MK_CPPFLAGS = -I. -Icommon
+MK_CFLAGS = -std=c11 -fPIC
+MK_CXXFLAGS = -std=c++11 -fPIC
+
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
-OPT = -Ofast
+MK_CFLAGS += -Ofast
+MK_HOST_CXXFLAGS += -Ofast
+MK_CUDA_CXXFLAGS += -O3
 else
-OPT = -O3
+MK_CFLAGS += -O3
+MK_CXXFLAGS += -O3
 endif
 
-MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS = $(OPT) -std=c11 -fPIC
-MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
-MK_LDFLAGS =
 
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@@ -233,7 +236,7 @@ ifndef RISCV
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 # Use all CPU extensions that are available:
 MK_CFLAGS += -march=native -mtune=native
-MK_CXXFLAGS += -march=native -mtune=native
+MK_HOST_CXXFLAGS += -march=native -mtune=native
 
 # Usage AVX-only
 #MK_CFLAGS += -mfma -mf16c -mavx
@@ -373,7 +376,7 @@ ifdef LLAMA_CUDA_CCBIN
 NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS_CUDA) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_CLBLAST
@@ -442,28 +445,30 @@ k_quants.o: k_quants.c k_quants.h
 endif # LLAMA_NO_K_QUANTS
 
 # combine build flags with cmdline overrides
 override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
 override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
+override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
+override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 
-COMMA := ,
-CXXFLAGS_CUDA := $(CXXFLAGS)
-CXXFLAGS_CUDA := $(subst -march=native -mtune=native,--compiler-options=-march=native$(COMMA)-mtune=native,$(CXXFLAGS_CUDA))
-CXXFLAGS_CUDA := $(subst -Ofast,-O3,$(CXXFLAGS_CUDA))
+# save CXXFLAGS before we add host-only options
+NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
+override CXXFLAGS += $(HOST_CXXFLAGS)
 
 #
 # Print build information
 #
 
 $(info I llama.cpp build info: )
 $(info I UNAME_S: $(UNAME_S))
 $(info I UNAME_P: $(UNAME_P))
 $(info I UNAME_M: $(UNAME_M))
 $(info I CFLAGS: $(CFLAGS))
 $(info I CXXFLAGS: $(CXXFLAGS))
+$(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
 $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
 #

ggml-cuda.cu

@@ -181,9 +181,9 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 #endif // CUDART_VERSION >= 11
 
 #if CUDART_VERSION >= 11100
-#define GGML_ASSUME(x) __builtin_assume(x)
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
 #else
-#define GGML_ASSUME(x)
+#define GGML_CUDA_ASSUME(x)
 #endif // CUDART_VERSION >= 11100
 
 #ifdef GGML_CUDA_F16
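For reference, a minimal standalone sketch (not part of this commit) of how the renamed macro is meant to be used: on CUDA 11.1+ GGML_CUDA_ASSUME(x) expands to __builtin_assume(x), handing the compiler a range fact it cannot prove on its own, and on older toolkits it compiles away. The macro definition mirrors the hunk above; the copy_row kernel, its launch configuration, and the file name are illustrative assumptions, not code from llama.cpp.

// assume_demo.cu -- illustrative sketch only, not from this commit.
// Shows the usage pattern of GGML_CUDA_ASSUME: the launch configuration below
// guarantees 0 <= k < WARP_SIZE, and the assume lets the compiler rely on that.
#include <cuda_runtime.h>
#include <cstdio>

#if CUDART_VERSION >= 11100
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)   // no-op on toolkits older than 11.1
#endif // CUDART_VERSION >= 11100

#define WARP_SIZE 32

// Hypothetical kernel: each thread of a single warp copies one element.
__global__ void copy_row(const float * src, float * dst) {
    const int k = threadIdx.x;

    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k <  WARP_SIZE);

    dst[k] = src[k];
}

int main() {
    float h_src[WARP_SIZE], h_dst[WARP_SIZE];
    for (int i = 0; i < WARP_SIZE; ++i) h_src[i] = (float) i;

    float *d_src, *d_dst;
    cudaMalloc(&d_src, sizeof(h_src));
    cudaMalloc(&d_dst, sizeof(h_dst));
    cudaMemcpy(d_src, h_src, sizeof(h_src), cudaMemcpyHostToDevice);

    copy_row<<<1, WARP_SIZE>>>(d_src, d_dst);   // one warp, so k < WARP_SIZE holds

    cudaMemcpy(h_dst, d_dst, sizeof(h_dst), cudaMemcpyDeviceToHost);
    printf("dst[%d] = %.1f\n", WARP_SIZE - 1, h_dst[WARP_SIZE - 1]);

    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}

Compiled with something like nvcc assume_demo.cu -o assume_demo. In the new Makefile split, flags meant for nvcc itself belong in CUDA_CXXFLAGS, while host-compiler-only flags such as -march=native go in HOST_CXXFLAGS and reach nvcc only wrapped in -Xcompiler.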
@@ -2141,10 +2141,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-GGML_ASSUME(i_offset >= 0);
-GGML_ASSUME(i_offset < nwarps);
-GGML_ASSUME(k >= 0);
-GGML_ASSUME(k < WARP_SIZE);
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);
 
 const int kbx = k / QI4_0;
 const int kqsx = k % QI4_0;
@@ -2235,10 +2235,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-GGML_ASSUME(i_offset >= 0);
-GGML_ASSUME(i_offset < nwarps);
-GGML_ASSUME(k >= 0);
-GGML_ASSUME(k < WARP_SIZE);
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);
 
 const int kbx = k / QI4_1;
 const int kqsx = k % QI4_1;
@@ -2327,10 +2327,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-GGML_ASSUME(i_offset >= 0);
-GGML_ASSUME(i_offset < nwarps);
-GGML_ASSUME(k >= 0);
-GGML_ASSUME(k < WARP_SIZE);
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);
 
 const int kbx = k / QI5_0;
 const int kqsx = k % QI5_0;
@@ -2441,10 +2441,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-GGML_ASSUME(i_offset >= 0);
-GGML_ASSUME(i_offset < nwarps);
-GGML_ASSUME(k >= 0);
-GGML_ASSUME(k < WARP_SIZE);
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);
 
 const int kbx = k / QI5_1;
 const int kqsx = k % QI5_1;
@@ -2547,10 +2547,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-GGML_ASSUME(i_offset >= 0);
-GGML_ASSUME(i_offset < nwarps);
-GGML_ASSUME(k >= 0);
-GGML_ASSUME(k < WARP_SIZE);
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);
 
 const int kbx = k / QI8_0;
 const int kqsx = k % QI8_0;
@@ -2638,10 +2638,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-GGML_ASSUME(i_offset >= 0);
-GGML_ASSUME(i_offset < nwarps);
-GGML_ASSUME(k >= 0);
-GGML_ASSUME(k < WARP_SIZE);
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);
 
 const int kbx = k / QI2_K;
 const int kqsx = k % QI2_K;
@@ -2759,10 +2759,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-GGML_ASSUME(i_offset >= 0);
-GGML_ASSUME(i_offset < nwarps);
-GGML_ASSUME(k >= 0);
-GGML_ASSUME(k < WARP_SIZE);
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);
 
 const int kbx = k / QI3_K;
 const int kqsx = k % QI3_K;
@@ -2977,10 +2977,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-GGML_ASSUME(i_offset >= 0);
-GGML_ASSUME(i_offset < nwarps);
-GGML_ASSUME(k >= 0);
-GGML_ASSUME(k < WARP_SIZE);
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);
 
 const int kbx = k / QI4_K; // == 0 if QK_K == 256
 const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3158,10 +3158,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-GGML_ASSUME(i_offset >= 0);
-GGML_ASSUME(i_offset < nwarps);
-GGML_ASSUME(k >= 0);
-GGML_ASSUME(k < WARP_SIZE);
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);
 
 const int kbx = k / QI5_K; // == 0 if QK_K == 256
 const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3287,10 +3287,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-GGML_ASSUME(i_offset >= 0);
-GGML_ASSUME(i_offset < nwarps);
-GGML_ASSUME(k >= 0);
-GGML_ASSUME(k < WARP_SIZE);
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);
 
 const int kbx = k / QI6_K; // == 0 if QK_K == 256
 const int kqsx = k % QI6_K; // == k if QK_K == 256