Merge branch 'ggerganov:master' into refine-ggml-backend-subsystem

commit 72197aef96
Author: zhouwg, 2024-06-01 09:02:30 +08:00 (committed by GitHub)
GPG signature: no known key found for this signature in database (key ID B5690EEEBB952194)
8 changed files with 54 additions and 24 deletions

Makefile

@@ -571,6 +571,7 @@ ifdef LLAMA_HIP_UMA
 MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
 MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
 HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
 HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)

README.md

@@ -396,7 +396,7 @@ On Mac and Linux, the homebrew package manager can be used via
 ```
 brew install llama.cpp
 ```
-The formula is automatically updated with new `llama.cpp` releases.
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
 ### Metal Build

convert-hf-to-gguf.py

@@ -2840,7 +2840,12 @@ def main() -> None:
     hparams = Model.load_hparams(dir_model)
     with torch.inference_mode():
-        model_class = Model.from_model_architecture(hparams["architectures"][0])
+        try:
+            model_class = Model.from_model_architecture(hparams["architectures"][0])
+        except NotImplementedError:
+            logger.error(f"Model {hparams['architectures'][0]} is not supported")
+            sys.exit(1)
         model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
         logger.info("Set model parameters")
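The new try/except turns an unrecognized `architectures` entry into a clean error message and exit code instead of an uncaught traceback. A minimal sketch of the lookup-and-fail pattern this hunk guards, not the converter's actual code: the `_REGISTRY` dict and the `main` wrapper below are illustrative stand-ins for `Model.from_model_architecture` and the converter's `main`.

import sys
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("hf-to-gguf-sketch")

# Hypothetical registry mapping HF architecture names to converter classes.
_REGISTRY = {"LlamaForCausalLM": object, "MistralForCausalLM": object}

def from_model_architecture(arch: str):
    # Mirrors the behaviour the patch relies on: unknown names raise NotImplementedError.
    try:
        return _REGISTRY[arch]
    except KeyError:
        raise NotImplementedError(f"Architecture {arch!r} not supported!") from None

def main(arch: str) -> None:
    try:
        model_class = from_model_architecture(arch)
    except NotImplementedError:
        logger.error(f"Model {arch} is not supported")
        sys.exit(1)  # exit with a non-zero status instead of dumping a traceback
    logger.info(f"Using converter class {model_class!r}")

if __name__ == "__main__":
    main("SomeUnknownForCausalLM")  # logs the error and exits with status 1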

File diff suppressed because one or more lines are too long

ggml-quants.c

@@ -6088,6 +6088,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 const uint8_t * restrict q2 = x[i].qs;
 const int8_t * restrict q8 = y[i].qs;
 const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
 const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
 const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);
@@ -6807,6 +6808,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 for (int i = 0; i < nb; ++i) {
 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+const uint8_t * restrict q3 = x[i].qs;
+const int8_t * restrict q8 = y[i].qs;
 // Set up scales
 memcpy(aux, x[i].scales, 12);
 __m128i scales128 = lsx_set_w(
@@ -6830,8 +6833,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 int is = 0;
 __m256i xvbit;
-const uint8_t * restrict q3 = x[i].qs;
-const int8_t * restrict q8 = y[i].qs;
 for (int j = 0; j < QK_K/128; ++j) {
 // load low 2 bits
@@ -7404,6 +7405,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 *s = vec_extract(vsumf0, 0);
 #elif defined __loongarch_asx
+GGML_UNUSED(kmask1);
+GGML_UNUSED(kmask2);
+GGML_UNUSED(kmask3);
 const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
@@ -7416,6 +7420,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 memcpy(utmp, x[i].scales, 12);
+utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+const uint32_t uaux = utmp[1] & kmask1;
+utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+utmp[2] = uaux;
+utmp[0] &= kmask1;
 const uint8_t * restrict q4 = x[i].qs;
 const int8_t * restrict q8 = y[i].qs;
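The lines added after the memcpy repack the 12-byte K-quant scales field into utmp so that, viewed as 16 bytes, the first 8 hold the 6-bit block scales and the last 8 hold the 6-bit block mins; the same unpack is added to the q5_K kernel below. A Python sketch of what the word-level shuffle computes, assuming the kmask constants defined earlier in ggml-quants.c (kmask1 = 0x3f3f3f3f, kmask2 = 0x0f0f0f0f, kmask3 = 0x03030303); the reference decoder is only there to cross-check the shuffle, it is not part of the patch.

import struct

KMASK1, KMASK2, KMASK3 = 0x3F3F3F3F, 0x0F0F0F0F, 0x03030303

def unpack_scales_mins(scales: bytes):
    """Repack the 12-byte K-quant scales field into 8 scales and 8 mins (6 bits each)."""
    assert len(scales) == 12
    utmp = list(struct.unpack("<3I", scales)) + [0]
    # Same word shuffle as the added C lines, in the same order.
    utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4)
    uaux = utmp[1] & KMASK1
    utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4)
    utmp[2] = uaux
    utmp[0] &= KMASK1
    out = struct.pack("<4I", *utmp)
    return list(out[:8]), list(out[8:])   # 8 scales, then 8 mins

def reference(scales: bytes):
    # Straightforward per-block decode of the same 12-byte layout.
    sc, mn = [], []
    for j in range(8):
        if j < 4:
            sc.append(scales[j] & 63)
            mn.append(scales[j + 4] & 63)
        else:
            sc.append((scales[j + 4] & 0x0F) | ((scales[j - 4] >> 6) << 4))
            mn.append((scales[j + 4] >> 4) | ((scales[j] >> 6) << 4))
    return sc, mn

blob = bytes(range(1, 13))                        # arbitrary 12-byte example
assert unpack_scales_mins(blob) == reference(blob)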
@@ -7455,16 +7464,17 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 __m256 vd = __lasx_xvreplfr2vr_s(d);
 acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
 }
 acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
 __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
 acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
 ft_union fi;
 fi.i = __lsx_vpickve2gr_w(acc_m, 0);
 *s = hsum_float_8(acc) + fi.f ;
 #else
 const uint8_t * scales = (const uint8_t*)&utmp[0];
@@ -8002,6 +8012,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 *s = vec_extract(vsumf0, 0);
 #elif defined __loongarch_asx
+GGML_UNUSED(kmask1);
+GGML_UNUSED(kmask2);
+GGML_UNUSED(kmask3);
 const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
 const __m128i mzero = __lsx_vldi(0);
@@ -8020,6 +8033,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 memcpy(utmp, x[i].scales, 12);
+utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+const uint32_t uaux = utmp[1] & kmask1;
+utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+utmp[2] = uaux;
+utmp[0] &= kmask1;
 const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
@@ -8069,10 +8087,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 p16_1 = lasx_madd_h(scale_1, p16_1);
 sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
 }
 __m256 vd = __lasx_xvreplfr2vr_s(d);
 acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
 }
 *s = hsum_float_8(acc) + summs;

ggml.c

@@ -1576,11 +1576,11 @@ do { \
 // F16 arithmetic is not supported by AVX, so we use F32 instead
 #define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
-static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
 float tmp[8];
 for (int i = 0; i < 8; i++) {
@@ -1589,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {
 return (__m256)__lasx_xvld(tmp, 0);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 float arr[8];
 __lasx_xvst(y, arr, 0);
-for (int i = 0; i < 8; i++)
+for (int i = 0; i < 8; i++) {
 x[i] = GGML_FP32_TO_FP16(arr[i]);
+}
 }
 #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1671,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 4
-static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
 float tmp[4];
 tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -1682,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
 return __lsx_vld(tmp, 0);
 }
-static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 float arr[4];
 __lsx_vst(y, arr, 0);
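These helpers exist because the LASX/LSX paths, like the AVX path noted in the comment above, have no native F16 arithmetic: half-precision values are widened to F32 element by element before the vector load, and narrowed back to F16 on store. A small numpy sketch of the same widen-compute-narrow pattern, illustrative only and not ggml code:

import numpy as np

def f32x8_load(x):
    # widen 8 half-precision values to single precision before doing arithmetic
    return np.asarray(x[:8], dtype=np.float16).astype(np.float32)

def f32x8_store(y):
    # narrow the 8 single-precision results back to half precision
    return np.asarray(y[:8], dtype=np.float32).astype(np.float16)

a = np.array([1.5, 2.25, -3.0, 0.125, 4.0, -0.5, 8.0, 0.75], dtype=np.float16)
b = np.full(8, 0.5, dtype=np.float16)
c = f32x8_store(f32x8_load(a) * f32x8_load(b))   # multiply in F32, keep results in F16
print(c)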

scripts/compare-llama-bench.py

@@ -19,22 +19,22 @@ logger = logging.getLogger("compare-llama-bench")
 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
-    "cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas",
-    "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads",
-    "type_k", "type_v", "no_kv_offload", "tensor_split", "n_prompt", "n_gen"
+    "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
+    "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads",
+    "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
 ]
 # Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "opencl", "metal", "gpu_blas", "blas"]
+BOOL_PROPERTIES = ["cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
 # Header names for the table:
 PRETTY_NAMES = {
-    "cuda": "CUDA", "opencl": "OpenCL", "metal": "Metal", "gpu_blas": "GPU BLAS", "blas": "BLAS",
-    "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
-    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Parameters",
-    "n_batch": "Batch size", "n_threads": "Threads", "type_k": "K type", "type_v": "V type",
-    "n_gpu_layers": "GPU layers", "main_gpu": "Main GPU", "no_kv_offload": "NKVO",
-    "tensor_split": "Tensor split"
+    "cuda": "CUDA", "opencl": "OpenCL", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
+    "gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
+    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
+    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
+    "main_gpu": "Main GPU", "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split",
+    "use_mmap": "Use mmap", "embeddings": "Embeddings",
 }
 DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
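A short sketch of how these three constants fit together when rendering a results row: properties in BOOL_PROPERTIES become Yes/No, and column headers come from PRETTY_NAMES. The `format_row` helper and the truncated constants below are illustrative, not the script's actual code.

BOOL_PROPERTIES = ["cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
                   "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
PRETTY_NAMES = {"cuda": "CUDA", "flash_attn": "FlashAttention", "n_gpu_layers": "GPU layers"}

def format_row(result: dict, show: list) -> tuple:
    # Headers fall back to the raw key when no pretty name is defined.
    headers = [PRETTY_NAMES.get(prop, prop) for prop in show]
    values = []
    for prop in show:
        value = result[prop]
        if prop in BOOL_PROPERTIES:
            value = "Yes" if value else "No"   # booleans are shown as Yes/No
        values.append(str(value))
    return headers, values

result = {"cuda": 1, "flash_attn": 0, "n_gpu_layers": 99}
print(format_row(result, ["cuda", "flash_attn", "n_gpu_layers"]))
# (['CUDA', 'FlashAttention', 'GPU layers'], ['Yes', 'No', '99'])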

tests/CMakeLists.txt

@@ -129,8 +129,11 @@ llama_target_and_test(test-rope.cpp)
 llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_target_and_test(test-autorelease.cpp LABEL "model")
-llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
-target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
+if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+    target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+endif()
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)