Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	Makefile

commit 84b28c4282 (6 changed files with 1099 additions and 1096 deletions)
CMakeLists.txt

@@ -48,7 +48,7 @@ set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA ke
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
+option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 
@@ -87,7 +87,7 @@ if (LLAMA_CUBLAS)
         add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
         add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-        if (LLAMA_CUDA_DMMV_F16)
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
             add_compile_definitions(GGML_CUDA_F16)
         endif()
         add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
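Note: either option above ends up defining the single GGML_CUDA_F16 preprocessor symbol, so the rename stays backward compatible with existing -DLLAMA_CUDA_DMMV_F16=ON builds. A minimal C sketch of how code can key off that symbol (an illustration, not the actual ggml-cuda.cu source):

/* Hypothetical demo file; only the GGML_CUDA_F16 symbol comes from the
 * diff above. Compile with e.g. `cc -DGGML_CUDA_F16 f16_flag_demo.c`
 * to take the 16-bit path; without the define, the 32-bit default applies. */
#include <stdio.h>

#ifdef GGML_CUDA_F16
static const char * dmmv_type = "16-bit floats (half)";
#else
static const char * dmmv_type = "32-bit floats";
#endif

int main(void) {
    printf("dmmv CUDA kernels would use: %s\n", dmmv_type);
    return 0;
}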
@@ -103,7 +103,7 @@ if (LLAMA_CUBLAS)
         # 60 == f16 CUDA intrinsics
         # 61 == integer CUDA intrinsics
         # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
-        if (LLAMA_CUDA_DMMV_F16)
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
         else()
             set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
Makefile (8 changes)
@@ -42,7 +42,7 @@ endif
 
 # keep standard at C11 and C++11
 CFLAGS   = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
-CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
+CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
 LDFLAGS  =
 
 # these are used on windows, to build some libraries with extra old device compatibility
@@ -188,11 +188,11 @@ ifdef LLAMA_CUDA_CCBIN
 	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_METAL
(File diff suppressed because it is too large.)
examples/server/public/index.html

@@ -3,12 +3,11 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+    <meta name="color-scheme" content="light dark">
     <title>llama.cpp - chat</title>
 
     <style>
         body {
-            background-color: #fff;
-            color: #000;
             font-family: system-ui;
             font-size: 90%;
         }
ggml-metal.m (33 changes)
@@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
                         // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
 
                         GGML_ASSERT(ne00 == ne10);
-                        GGML_ASSERT(ne02 == ne12);
+                        // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+                        GGML_ASSERT(ne03 == ne13);
 
                         if (ggml_is_contiguous(src0) &&
                             ggml_is_contiguous(src1) &&
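Note: the relaxed assertion lets src1 carry more batch slices than src0 along dim 2, while dim 3 must still match exactly. A minimal C sketch of the implied shape rule, using ggml's ne-array convention (hypothetical helper, not the actual ggml-metal.m code):

#include <assert.h>
#include <stdint.h>

/* Hypothetical shape check: dim 0 is the shared inner dimension,
 * dim 2 may broadcast (ne12 an integer multiple of ne02), and
 * dim 3 still has to match exactly. */
static void check_mul_mat_shapes(const int64_t ne_src0[4], const int64_t ne_src1[4]) {
    assert(ne_src0[0] == ne_src1[0]);     /* ne00 == ne10 */
    assert(ne_src1[2] % ne_src0[2] == 0); /* ne12 is a multiple of ne02 */
    assert(ne_src0[3] == ne_src1[3]);     /* ne03 == ne13 */
}

int main(void) {
    const int64_t a[4] = {64, 32, 2, 1};  /* src0: 2 batch slices */
    const int64_t b[4] = {64, 16, 8, 1};  /* src1: 8 batch slices */
    check_mul_mat_shapes(a, b);           /* passes: 8 is a multiple of 2 */
    return 0;
}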
@@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
                                 initWithDevice:ctx->device transposeLeft:false transposeRight:true
                                 resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
 
-                        // we need to do ne02 multiplications
+                        // we need to do ne12 multiplications
                         // TODO: is there a way to do this in parallel - currently very slow ..
                         // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-                        for (int64_t i02 = 0; i02 < ne02; ++i02) {
-                            size_t offs_src0_cur = offs_src0 + i02*nb02;
+                        for (int64_t i02 = 0; i02 < ne12; ++i02) {
+                            size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
                             size_t offs_src1_cur = offs_src1 + i02*nb12;
                             size_t offs_dst_cur  = offs_dst  + i02*nb2;
 
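Note: the loop now runs once per src1 slice (ne12) and maps each iteration back to a src0 slice by integer division, so one src0 matrix serves ne12/ne02 consecutive src1 slices, the layout grouped-query attention uses. A standalone C sketch of that index mapping with hypothetical sizes:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne02 = 2; /* hypothetical: src0 batch slices */
    const int64_t ne12 = 8; /* hypothetical: src1 batch slices */

    /* Same mapping as offs_src0 + i02/(ne12/ne02)*nb02 above:
     * each src0 slice is reused ne12/ne02 = 4 times. */
    for (int64_t i02 = 0; i02 < ne12; ++i02) {
        printf("dst slice %lld <- src0 slice %lld x src1 slice %lld\n",
               (long long) i02,
               (long long) (i02 / (ne12 / ne02)),
               (long long) i02);
    }
    return 0;
}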
@@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
                     switch (src0t) {
                         case GGML_TYPE_F16:
                             {
-                                GGML_ASSERT(ne02 == ne12);
-
                                 nth0 = 64;
                                 nth1 = 1;
                                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
                     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                     [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                     [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                    [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-                    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-                    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-                    [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-                    [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-                    [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-                    [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-                    [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-                    [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
-                    [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
+                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                    [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                    [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+                    [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+                    [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+                    [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+                    [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+                    [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+                    [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
+                    [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
 
                     if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                         src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
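Note: inserting ne02 at index 5 and ne12 at index 11 shifts every later atIndex by one, and these numbers must line up exactly with the kernel's parameter order in ggml-metal.metal (shown below). One way to keep both sides in sync would be a shared constant table; the C enum here is a hypothetical suggestion, not code from this commit:

/* Hypothetical argument-index table for kernel_mul_mat_f16_f32 after
 * this change; sharing it between the host encoder calls and the shader
 * would prevent setBytes:atIndex: and the kernel signature from drifting. */
enum mul_mat_f16_f32_arg {
    ARG_SRC0 = 0,  ARG_SRC1 = 1,  ARG_DST  = 2,
    ARG_NE00 = 3,  ARG_NE01 = 4,  ARG_NE02 = 5,
    ARG_NB00 = 6,  ARG_NB01 = 7,  ARG_NB02 = 8,
    ARG_NE10 = 9,  ARG_NE11 = 10, ARG_NE12 = 11,
    ARG_NB10 = 12, ARG_NB11 = 13, ARG_NB12 = 14,
    ARG_NE0  = 15, ARG_NE1  = 16
};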
ggml-metal.metal
@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
+        constant   int64_t & ne02,
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
+        constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
     const int64_t r1 = tgpig.y;
     const int64_t im = tgpig.z;
 
-    device const half  * x = (device const half  *) (src0 + r0*nb01 + im*nb02);
+    device const half  * x = (device const half  *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
     sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
     }
 }
 
+
 kernel void kernel_alibi_f32(
         device const float * src0,
         device       float * dst,