Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	Makefile
commit 84b28c4282
Concedo, 2023-08-01 18:13:27 +08:00
6 changed files with 1099 additions and 1096 deletions

CMakeLists.txt

@@ -48,7 +48,7 @@ set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA ke
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
+option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -87,7 +87,7 @@ if (LLAMA_CUBLAS)
 add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
 add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
 add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-if (LLAMA_CUDA_DMMV_F16)
+if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
 add_compile_definitions(GGML_CUDA_F16)
 endif()
 add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -103,7 +103,7 @@ if (LLAMA_CUBLAS)
 # 60 == f16 CUDA intrinsics
 # 61 == integer CUDA intrinsics
 # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
-if (LLAMA_CUDA_DMMV_F16)
+if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
 set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
 else()
 set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
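For orientation, the LLAMA_CUDA_F16 / LLAMA_CUDA_DMMV_F16 options above feed the GGML_CUDA_F16 compile definition set in the previous hunk. A minimal sketch of the kind of switch that define gates (the typedef name is illustrative, not quoted from ggml-cuda.cu):

    // Minimal sketch, assuming GGML_CUDA_F16 selects 16-bit intermediates for the
    // dmmv/mmv CUDA kernels; the typedef name is illustrative.
    #ifdef GGML_CUDA_F16
    typedef half  dmmv_elem_t;  // f16 intrinsics need compute capability >= 6.0, hence "60;61;70"
    #else
    typedef float dmmv_elem_t;  // f32 fallback, also runs on the older "37;52" architectures
    #endif

This is also why the f16 branch narrows CMAKE_CUDA_ARCHITECTURES to 60 and above, per the "# needed for f16 CUDA intrinsics" comment.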

Makefile

@@ -42,7 +42,7 @@ endif
 # keep standard at C11 and C++11
 CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
-CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
+CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
 LDFLAGS =
 # these are used on windows, to build some libraries with extra old device compatibility
@@ -188,11 +188,11 @@ ifdef LLAMA_CUDA_CCBIN
 NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
 ifdef LLAMA_METAL

File diff suppressed because it is too large.

examples/server/public/index.html

@@ -3,12 +3,11 @@
 <head>
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+<meta name="color-scheme" content="light dark">
 <title>llama.cpp - chat</title>
 <style>
 body {
-background-color: #fff;
-color: #000;
 font-family: system-ui;
 font-size: 90%;
 }

ggml-metal.m

@@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
 // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
 GGML_ASSERT(ne00 == ne10);
-GGML_ASSERT(ne02 == ne12);
+// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+GGML_ASSERT(ne03 == ne13);
 if (ggml_is_contiguous(src0) &&
 ggml_is_contiguous(src1) &&
@@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
 initWithDevice:ctx->device transposeLeft:false transposeRight:true
 resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
-// we need to do ne02 multiplications
+// we need to do ne12 multiplications
 // TODO: is there a way to do this in parallel - currently very slow ..
 // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-for (int64_t i02 = 0; i02 < ne02; ++i02) {
-size_t offs_src0_cur = offs_src0 + i02*nb02;
+for (int64_t i02 = 0; i02 < ne12; ++i02) {
+size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
 size_t offs_src1_cur = offs_src1 + i02*nb12;
 size_t offs_dst_cur = offs_dst + i02*nb2;
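The two '+' lines above are the host-side half of broadcasting src0 across src1's batch dimension: src1 may now carry more batch slices (ne12) than src0 (ne02), which is also why the ne02 == ne12 assert was relaxed earlier in this file. A small self-contained sketch of the index mapping, with illustrative names not taken from the source:

    #include <stdint.h>
    #include <stddef.h>

    // Maps a src1/dst batch index i02 (0 <= i02 < ne12) to the src0 slice it reads,
    // assuming ne12 is a whole multiple of ne02 (e.g. grouped-query attention, where
    // several query heads share one KV head).
    static size_t src0_batch_offset(int64_t i02, int64_t ne02, int64_t ne12, size_t nb02) {
        const int64_t repeat = ne12 / ne02;    // src1 slices per src0 slice
        return (size_t)(i02 / repeat) * nb02;  // same form as i02/(ne12/ne02)*nb02 above
    }

For ne02 = 8 and ne12 = 32, batches 0..3 all read src0 slice 0, batches 4..7 read slice 1, and so on; with ne02 == ne12 the mapping degenerates to i02*nb02, i.e. the previous behaviour.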
@@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
 switch (src0t) {
 case GGML_TYPE_F16:
 {
-GGML_ASSERT(ne02 == ne12);
 nth0 = 64;
 nth1 = 1;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
 [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
-[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
+[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
+[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
 src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
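As a cross-check (this enum is hypothetical and only summarizes the diff, it is not part of the change), the atIndex values set above must stay in sync with the argument order of the Metal mul_mat kernels; that is why the kernel signature in the next file gains ne02 and ne12 at exactly slots 5 and 11:

    // Hypothetical summary of the new argument slots; the names are illustrative,
    // only the index assignments come from the diff above.
    enum {
        ARG_SRC0 = 0,  ARG_SRC1 = 1,  ARG_DST  = 2,
        ARG_NE00 = 3,  ARG_NE01 = 4,  ARG_NE02 = 5,
        ARG_NB00 = 6,  ARG_NB01 = 7,  ARG_NB02 = 8,
        ARG_NE10 = 9,  ARG_NE11 = 10, ARG_NE12 = 11,
        ARG_NB10 = 12, ARG_NB11 = 13, ARG_NB12 = 14,
        ARG_NE0  = 15, ARG_NE1  = 16,
    };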

ggml-metal.metal

@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
 device float * dst,
 constant int64_t & ne00,
 constant int64_t & ne01,
+constant int64_t & ne02,
 constant uint64_t & nb00,
 constant uint64_t & nb01,
 constant uint64_t & nb02,
 constant int64_t & ne10,
 constant int64_t & ne11,
+constant int64_t & ne12,
 constant uint64_t & nb10,
 constant uint64_t & nb11,
 constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
 const int64_t r1 = tgpig.y;
 const int64_t im = tgpig.z;
-device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
 device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
 }
 }
 kernel void kernel_alibi_f32(
 device const float * src0,
 device float * dst,