Merge branch 'master' into concedo_experimental
# Conflicts: # CMakeLists.txt # Makefile # README.md # ggml.c
This commit is contained in:
commit
4605074245
5 changed files with 561 additions and 793 deletions
15
.gitignore
vendored
15
.gitignore
vendored
|
@ -1,11 +1,15 @@
|
||||||
*.o
|
*.o
|
||||||
*.a
|
*.a
|
||||||
|
.DS_Store
|
||||||
|
.build/
|
||||||
.cache/
|
.cache/
|
||||||
|
.direnv/
|
||||||
|
.envrc
|
||||||
|
.swiftpm
|
||||||
|
.venv
|
||||||
.vs/
|
.vs/
|
||||||
.vscode/
|
.vscode/
|
||||||
.DS_Store
|
|
||||||
|
|
||||||
.build/
|
|
||||||
build/
|
build/
|
||||||
build-em/
|
build-em/
|
||||||
build-debug/
|
build-debug/
|
||||||
|
@ -30,15 +34,12 @@ models/*
|
||||||
arm_neon.h
|
arm_neon.h
|
||||||
compile_commands.json
|
compile_commands.json
|
||||||
|
|
||||||
.envrc
|
|
||||||
.direnv/
|
|
||||||
|
|
||||||
.venv
|
|
||||||
__pycache__
|
__pycache__
|
||||||
.swiftpm
|
|
||||||
|
|
||||||
dist/
|
dist/
|
||||||
*.spec
|
*.spec
|
||||||
|
|
||||||
zig-out/
|
zig-out/
|
||||||
zig-cache/
|
zig-cache/
|
||||||
|
|
||||||
|
ppl-*.txt
|
||||||
|
|
30
Makefile
30
Makefile
|
@ -1,3 +1,8 @@
|
||||||
|
default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast
|
||||||
|
simple: koboldcpp koboldcpp_noavx2
|
||||||
|
dev: koboldcpp_openblas
|
||||||
|
|
||||||
|
|
||||||
ifndef UNAME_S
|
ifndef UNAME_S
|
||||||
UNAME_S := $(shell uname -s)
|
UNAME_S := $(shell uname -s)
|
||||||
endif
|
endif
|
||||||
|
@ -117,6 +122,9 @@ endif
|
||||||
ifdef LLAMA_CUBLAS
|
ifdef LLAMA_CUBLAS
|
||||||
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
|
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
|
||||||
LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64
|
LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64
|
||||||
|
OBJS += ggml-cuda.o
|
||||||
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||||
|
nvcc -arch=native -c -o $@ $<
|
||||||
endif
|
endif
|
||||||
ifdef LLAMA_GPROF
|
ifdef LLAMA_GPROF
|
||||||
CFLAGS += -pg
|
CFLAGS += -pg
|
||||||
|
@ -184,10 +192,6 @@ $(info I CC: $(CCV))
|
||||||
$(info I CXX: $(CXXV))
|
$(info I CXX: $(CXXV))
|
||||||
$(info )
|
$(info )
|
||||||
|
|
||||||
default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast
|
|
||||||
simple: koboldcpp koboldcpp_noavx2
|
|
||||||
dev: koboldcpp_openblas
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build library
|
# Build library
|
||||||
#
|
#
|
||||||
|
@ -234,7 +238,7 @@ gpttype_adapter.o: gpttype_adapter.cpp
|
||||||
clean:
|
clean:
|
||||||
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize-stats perplexity embedding benchmark-q4_0-matmult main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_noavx2.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so gptj.exe gpt2.exe
|
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize-stats perplexity embedding benchmark-q4_0-matmult main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_noavx2.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so gptj.exe gpt2.exe
|
||||||
|
|
||||||
main: examples/main/main.cpp ggml.o llama.o common.o
|
main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
@echo
|
@echo
|
||||||
@echo '==== Run ./main -h for help. ===='
|
@echo '==== Run ./main -h for help. ===='
|
||||||
|
@ -258,32 +262,32 @@ koboldcpp_clblast: ggml_clblast.o ggml_rwkv.o ggml_v1.o expose.o common.o llama_
|
||||||
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
|
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
|
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
|
||||||
|
|
||||||
quantize_gptj: ggml.o llama.o otherarch/tools/gptj_quantize.cpp
|
quantize_gptj: ggml.o llama.o otherarch/tools/gptj_quantize.cpp
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
quantize_gpt2: ggml.o llama.o otherarch/tools/gpt2_quantize.cpp
|
quantize_gpt2: ggml.o llama.o otherarch/tools/gpt2_quantize.cpp
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
|
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
|
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
vdot: pocs/vdot/vdot.cpp ggml.o
|
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
libllama.so: llama.o ggml.o
|
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
libllama.so: llama.o ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Tests
|
# Tests
|
||||||
#
|
#
|
||||||
|
|
||||||
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
|
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
|
||||||
./benchmark-q4_0-matmult
|
./benchmark-q4_0-matmult
|
||||||
|
|
||||||
|
|
116
ggml-cuda.cu
Normal file
116
ggml-cuda.cu
Normal file
|
@ -0,0 +1,116 @@
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
|
||||||
|
// Raw 16-bit storage type used for half-precision values on the host side.
typedef uint16_t ggml_fp16_t;

// The device-side __half must occupy exactly the same number of bytes as the
// raw 16-bit storage type, otherwise the quantized block layouts would differ.
static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||||
|
|
||||||
|
// Number of dequantized values produced per Q4_0 block.
#define QK4_0 32

// One Q4_0 block: a float scale followed by QK4_0 4-bit quants packed two per
// byte. The dequantization kernel in this file reads the low nibble of each
// byte first, then the high nibble.
typedef struct {
    float   d;              // delta (scale)
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
} block_q4_0;

// The struct is overlaid on a raw byte buffer, so it must not contain padding.
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
||||||
|
|
||||||
|
// Number of dequantized values produced per Q4_1 block.
#define QK4_1 32

// One Q4_1 block: a float scale and a float offset followed by QK4_1 4-bit
// quants packed two per byte. The dequantization kernel in this file computes
// each value as quant * d + m.
typedef struct {
    float   d;              // delta (scale)
    float   m;              // min (offset added after scaling)
    uint8_t qs[QK4_1 / 2];  // nibbles / quants
} block_q4_1;

// The struct is overlaid on a raw byte buffer, so it must not contain padding.
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||||
|
|
||||||
|
// Number of dequantized values produced per Q4_2 block.
#define QK4_2 16

// One Q4_2 block: a half-precision scale followed by QK4_2 4-bit quants packed
// two per byte. The scale is declared as __half here so device code can
// convert it to float directly.
typedef struct {
    __half  d;              // delta (scale), half precision
    uint8_t qs[QK4_2 / 2];  // nibbles / quants
} block_q4_2;

// The struct is overlaid on a raw byte buffer, so it must not contain padding.
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
|
||||||
|
|
||||||
|
|
||||||
|
// Dequantizes one Q4_0 block per CUDA block.
//
// vx : array of block_q4_0, indexed by blockIdx.x
// y  : output floats; block b writes y[b*QK4_0 .. b*QK4_0 + QK4_0 - 1]
//
// threadIdx is not consulted, so every thread of a CUDA block performs the
// same work; the launchers in this file use one thread per block.
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
    const int ib = blockIdx.x;

    const block_q4_0 * blk = (const block_q4_0 *) vx + ib;
    float * out = y + ib*QK4_0;

    const float scale = blk->d;

    // Each packed byte yields two outputs: low nibble first, then high nibble.
    for (int j = 0; j < QK4_0/2; ++j) {
        const uint8_t packed = blk->qs[j];

        const int lo = packed & 0xf;
        const int hi = packed >> 4;

        // Stored quants are unsigned 0..15; subtracting 8 recentres them.
        out[2*j + 0] = (lo - 8)*scale;
        out[2*j + 1] = (hi - 8)*scale;
    }
}
|
||||||
|
|
||||||
|
// Dequantizes one Q4_1 block per CUDA block.
//
// vx : array of block_q4_1, indexed by blockIdx.x
// y  : output floats; block b writes y[b*QK4_1 .. b*QK4_1 + QK4_1 - 1]
//
// threadIdx is not consulted, so every thread of a CUDA block performs the
// same work; the launchers in this file use one thread per block.
static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
    const int ib = blockIdx.x;

    const block_q4_1 * blk = (const block_q4_1 *) vx + ib;
    float * out = y + ib*QK4_1;

    const float scale  = blk->d;
    const float offset = blk->m;

    // Each packed byte yields two outputs: low nibble first, then high nibble.
    for (int j = 0; j < QK4_1/2; ++j) {
        const uint8_t packed = blk->qs[j];

        const int lo = packed & 0xf;
        const int hi = packed >> 4;

        // Quants are used as unsigned 0..15, scaled and then shifted by the block minimum.
        out[2*j + 0] = lo*scale + offset;
        out[2*j + 1] = hi*scale + offset;
    }
}
|
||||||
|
|
||||||
|
// Dequantizes one Q4_2 block per CUDA block.
//
// vx : array of block_q4_2, indexed by blockIdx.x
// y  : output floats; block b writes y[b*QK4_2 .. b*QK4_2 + QK4_2 - 1]
//
// threadIdx is not consulted, so every thread of a CUDA block performs the
// same work; the launchers in this file use one thread per block.
static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
    const int ib = blockIdx.x;

    const block_q4_2 * blk = (const block_q4_2 *) vx + ib;
    float * out = y + ib*QK4_2;

    // The scale is stored as __half; convert it to float once per block.
    const float scale = blk->d;

    // Each packed byte yields two outputs: low nibble first, then high nibble.
    for (int j = 0; j < QK4_2/2; ++j) {
        const uint8_t packed = blk->qs[j];

        const int lo = packed & 0xf;
        const int hi = packed >> 4;

        // Stored quants are unsigned 0..15; subtracting 8 recentres them.
        out[2*j + 0] = (lo - 8)*scale;
        out[2*j + 1] = (hi - 8)*scale;
    }
}
|
||||||
|
|
||||||
|
extern "C" {

// Host-side launchers for the dequantization kernels.
//
// Common contract for all three functions:
//   vx     : quantized input blocks; dereferenced by the kernel on the device
//   y      : float output buffer; written by the kernel on the device
//   k      : number of values to dequantize; only k / QK whole blocks are
//            processed, any remainder is ignored
//   stream : CUDA stream the kernel is enqueued on
//
// The functions only enqueue the kernel (one CUDA block of one thread per
// quantized block) and do not synchronize or check for errors themselves.
// When k holds no whole block (k < QK, including zero or negative k) they
// return without launching: a grid dimension of zero is an invalid launch
// configuration and would leave an error pending in the CUDA runtime.

// Dequantizes k values stored as block_q4_0.
__host__ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK4_0;
    if (nb <= 0) {
        return; // nothing to do; avoid an invalid zero-sized grid
    }
    dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
}

// Dequantizes k values stored as block_q4_1.
__host__ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK4_1;
    if (nb <= 0) {
        return; // nothing to do; avoid an invalid zero-sized grid
    }
    dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
}

// Dequantizes k values stored as block_q4_2.
__host__ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK4_2;
    if (nb <= 0) {
        return; // nothing to do; avoid an invalid zero-sized grid
    }
    dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
}

}
|
11
ggml-cuda.h
Normal file
11
ggml-cuda.h
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
||||||
|
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
||||||
|
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
Loading…
Add table
Add a link
Reference in a new issue