Merge branch 'master' into concedo_experimental
# Conflicts: # CMakeLists.txt # Makefile
This commit is contained in:
commit
84b28c4282
6 changed files with 1099 additions and 1096 deletions
|
@ -48,7 +48,7 @@ set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA ke
|
|||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
|
||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
|
||||
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
|
||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||
|
||||
|
@ -87,7 +87,7 @@ if (LLAMA_CUBLAS)
|
|||
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
|
||||
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
if (LLAMA_CUDA_DMMV_F16)
|
||||
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
|
||||
add_compile_definitions(GGML_CUDA_F16)
|
||||
endif()
|
||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
|
@ -103,7 +103,7 @@ if (LLAMA_CUBLAS)
|
|||
# 60 == f16 CUDA intrinsics
|
||||
# 61 == integer CUDA intrinsics
|
||||
# 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
||||
if (LLAMA_CUDA_DMMV_F16)
|
||||
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
|
|
8
Makefile
8
Makefile
|
@ -42,7 +42,7 @@ endif
|
|||
|
||||
# keep standard at C11 and C++11
|
||||
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
|
||||
CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
|
||||
CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
|
||||
LDFLAGS =
|
||||
|
||||
# these are used on windows, to build some libraries with extra old device compatibility
|
||||
|
@ -188,11 +188,11 @@ ifdef LLAMA_CUDA_CCBIN
|
|||
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
||||
endif
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
|
||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
|
||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
endif # LLAMA_CUBLAS
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -3,12 +3,11 @@
|
|||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
|
||||
<meta name="color-scheme" content="light dark">
|
||||
<title>llama.cpp - chat</title>
|
||||
|
||||
<style>
|
||||
body {
|
||||
background-color: #fff;
|
||||
color: #000;
|
||||
font-family: system-ui;
|
||||
font-size: 90%;
|
||||
}
|
||||
|
|
33
ggml-metal.m
33
ggml-metal.m
|
@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
|
|||
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
|
||||
|
||||
GGML_ASSERT(ne00 == ne10);
|
||||
GGML_ASSERT(ne02 == ne12);
|
||||
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
|
||||
GGML_ASSERT(ne03 == ne13);
|
||||
|
||||
if (ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) &&
|
||||
|
@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
|
|||
initWithDevice:ctx->device transposeLeft:false transposeRight:true
|
||||
resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
|
||||
|
||||
// we need to do ne02 multiplications
|
||||
// we need to do ne12 multiplications
|
||||
// TODO: is there a way to do this in parallel - currently very slow ..
|
||||
// TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
|
||||
for (int64_t i02 = 0; i02 < ne02; ++i02) {
|
||||
size_t offs_src0_cur = offs_src0 + i02*nb02;
|
||||
for (int64_t i02 = 0; i02 < ne12; ++i02) {
|
||||
size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
|
||||
size_t offs_src1_cur = offs_src1 + i02*nb12;
|
||||
size_t offs_dst_cur = offs_dst + i02*nb2;
|
||||
|
||||
|
@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
|
|||
switch (src0t) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
GGML_ASSERT(ne02 == ne12);
|
||||
|
||||
nth0 = 64;
|
||||
nth1 = 1;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||
|
@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
|
|||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
|
||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
|
||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
||||
|
|
|
@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
|
|||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
|
@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
|
|||
const int64_t r1 = tgpig.y;
|
||||
const int64_t im = tgpig.z;
|
||||
|
||||
device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
|
||||
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
|
||||
sum[tpitg.x] = 0.0f;
|
||||
|
@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
kernel void kernel_alibi_f32(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue