Merge branch 'master' of https://github.com/ggerganov/llama.cpp into finetune_enableGpu
commit 3af7756042
31 changed files with 3228 additions and 2899 deletions
.github/ISSUE_TEMPLATE/bug.md (2 changes, vendored)
@@ -1,7 +1,7 @@
 ---
 name: Bug template
 about: Used to report bugs in llama.cpp
-labels: ["bug"]
+labels: ["bug-unconfirmed"]
 assignees: ''

 ---
CMakeLists.txt
@@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use CUDA" OFF)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
+option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -93,7 +94,6 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
-option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@@ -277,13 +277,8 @@ if (LLAMA_BLAS)
 endif()
 endif()

-if (LLAMA_K_QUANTS)
-set(GGML_HEADERS_EXTRA k_quants.h)
-set(GGML_SOURCES_EXTRA k_quants.c)
-add_compile_definitions(GGML_USE_K_QUANTS)
-if (LLAMA_QKK_64)
-add_compile_definitions(GGML_QKK_64)
-endif()
+if (LLAMA_QKK_64)
+add_compile_definitions(GGML_QKK_64)
 endif()

 if (LLAMA_CUBLAS)
@@ -305,6 +300,9 @@ if (LLAMA_CUBLAS)
 if (LLAMA_CUDA_FORCE_DMMV)
 add_compile_definitions(GGML_CUDA_FORCE_DMMV)
 endif()
+if (LLAMA_CUDA_FORCE_MMQ)
+add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+endif()
 add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
 add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
 if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -405,6 +403,9 @@ if (LLAMA_HIPBLAS)
 if (LLAMA_CUDA_FORCE_DMMV)
 target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
 endif()
+if (LLAMA_CUDA_FORCE_MMQ)
+target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
+endif()
 target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
 target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
 target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -666,6 +667,8 @@ add_library(ggml OBJECT
 ggml-alloc.h
 ggml-backend.c
 ggml-backend.h
+ggml-quants.c
+ggml-quants.h
 ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
 ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
 ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
Makefile (27 changes)
@@ -342,13 +342,9 @@ else
 MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif

-ifndef LLAMA_NO_K_QUANTS
-MK_CPPFLAGS += -DGGML_USE_K_QUANTS
-OBJS += k_quants.o
 ifdef LLAMA_QKK_64
 MK_CPPFLAGS += -DGGML_QKK_64
 endif
-endif

 ifndef LLAMA_NO_ACCELERATE
 # Mac OS - include Accelerate framework.
@@ -365,7 +361,7 @@ ifdef LLAMA_MPI
 MK_CPPFLAGS += -DGGML_USE_MPI
 MK_CFLAGS += -Wno-cast-qual
 MK_CXXFLAGS += -Wno-cast-qual
 OBJS += ggml-mpi.o
 endif # LLAMA_MPI

 ifdef LLAMA_OPENBLAS
@@ -382,7 +378,7 @@ endif # LLAMA_BLIS
 ifdef LLAMA_CUBLAS
 MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 OBJS += ggml-cuda.o
 NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
 NVCC = $(LLAMA_CUDA_NVCC)
@@ -397,6 +393,9 @@ endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
 NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_FORCE_MMQ
+NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
 NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
@@ -494,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
     $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

-ifndef LLAMA_NO_K_QUANTS
-k_quants.o: k_quants.c k_quants.h
-    $(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_NO_K_QUANTS
-
 # combine build flags with cmdline overrides
 override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
 override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
@@ -539,15 +533,18 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
     $(CC) $(CFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+    $(CC) $(CFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o

 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
     $(CXX) $(CXXFLAGS) -c $< -o $@

-COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
-COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o

-common.o: common/common.cpp $(COMMON_H_DEPS)
+common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
     $(CXX) $(CXXFLAGS) -c $< -o $@

 sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
Package.swift
@@ -42,13 +42,12 @@ let package = Package(
 "llama.cpp",
 "ggml-alloc.c",
 "ggml-backend.c",
-"k_quants.c",
+"ggml-quants.c",
 ] + additionalSources,
 resources: resources,
 publicHeadersPath: "spm-headers",
 cSettings: [
 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-.define("GGML_USE_K_QUANTS"),
 .define("GGML_USE_ACCELERATE")
 // NOTE: NEW_LAPACK will required iOS version 16.4+
 // We should consider add this in the future when we drop support for iOS 14
build.zig (21 changes)
@@ -116,15 +116,10 @@ pub fn build(b: *std.build.Builder) !void {
 var make = try Maker.init(b);
 make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

-if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
-try make.addFlag("-DGGML_USE_K_QUANTS");
-const k_quants = make.obj("k_quants", "k_quants.c");
-try make.objs.append(k_quants);
-}
-
 const ggml = make.obj("ggml", "ggml.c");
 const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
 const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
+const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
 const llama = make.obj("llama", "llama.cpp");
 const common = make.obj("common", "common/common.cpp");
 const console = make.obj("console", "common/console.cpp");
@@ -133,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void {
 const train = make.obj("train", "common/train.cpp");
 const clip = make.obj("clip", "examples/llava/clip.cpp");

-_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
-_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
+_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
+_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
+_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });

-const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
+const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
 if (server.target.isWindows()) {
 server.linkSystemLibrary("ws2_32");
 }
common/common.cpp
@@ -218,12 +218,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 sparams.top_p = std::stof(argv[i]);
+} else if (arg == "--min-p") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+sparams.min_p = std::stof(argv[i]);
 } else if (arg == "--temp") {
 if (++i >= argc) {
 invalid_param = true;
 break;
 }
 sparams.temp = std::stof(argv[i]);
+sparams.temp = std::max(sparams.temp, 0.0f);
 } else if (arg == "--tfs") {
 if (++i >= argc) {
 invalid_param = true;
@@ -678,6 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
 printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
 printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
 printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
 printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
 printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@@ -743,7 +751,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
 printf(" --verbose-prompt print prompt before generation\n");
-fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
+printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
 printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
 printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
 printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -888,7 +896,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par

 std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
 llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-llama_kv_cache_tokens_rm(lctx, -1, -1);
+llama_kv_cache_clear(lctx);
 llama_reset_timings(lctx);
 }

@@ -1274,6 +1282,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
 fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
 fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
 fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
 fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
common/sampling.cpp
@@ -89,10 +89,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {

 snprintf(result, sizeof(result),
 "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
 "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
 params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
+params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
 params.mirostat, params.mirostat_eta, params.mirostat_tau);

 return std::string(result);
@@ -110,6 +110,7 @@ llama_token llama_sampling_sample(
 const float temp = params.temp;
 const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
 const float top_p = params.top_p;
+const float min_p = params.min_p;
 const float tfs_z = params.tfs_z;
 const float typical_p = params.typical_p;
 const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
@@ -167,8 +168,12 @@ llama_token llama_sampling_sample(
 llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
 }

-if (temp <= 0) {
-// greedy sampling
+if (temp < 0.0) {
+// greedy sampling, with probs
+llama_sample_softmax(ctx_main, &cur_p);
+id = cur_p.data[0].id;
+} else if (temp == 0.0) {
+// greedy sampling, no probs
 id = llama_sample_token_greedy(ctx_main, &cur_p);
 } else {
 if (mirostat == 1) {
@@ -186,6 +191,7 @@ llama_token llama_sampling_sample(
 llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
 llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
 llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep);
+llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep);
 llama_sample_temp (ctx_main, &cur_p, temp);

 id = llama_sample_token(ctx_main, &cur_p);
common/sampling.h
@@ -14,6 +14,7 @@ typedef struct llama_sampling_params {
 int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
 int32_t top_k = 40; // <= 0 to use vocab size
 float top_p = 0.95f; // 1.0 = disabled
+float min_p = 0.05f; // 0.0 = disabled
 float tfs_z = 1.00f; // 1.0 = disabled
 float typical_p = 1.00f; // 1.0 = disabled
 float temp = 0.80f; // 1.0 = disabled
convert.py (21 changes)
@@ -366,16 +366,19 @@ class SentencePieceVocab:
 added_tokens = {}

 vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-actual_ids = sorted(added_tokens.values())
-if expected_ids != actual_ids:
-    raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")

-items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-self.added_tokens_list = [text for (text, idx) in items]
-self.vocab_size_base: int = vocab_size
-self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-self.fname_tokenizer = fname_tokenizer
+new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+actual_new_ids = sorted(new_tokens.keys())
+
+if expected_new_ids != actual_new_ids:
+    raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+# Token pieces that were added to the base vocabulary.
+self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
+self.vocab_size_base = vocab_size
+self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+self.fname_tokenizer = fname_tokenizer
 self.fname_added_tokens = fname_added_tokens

 def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -185,7 +185,7 @@ int main(int argc, char ** argv) {

 const auto t_pp_start = ggml_time_us();

-llama_kv_cache_tokens_rm(ctx, -1, -1);
+llama_kv_cache_clear(ctx);

 if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
 LOG_TEE("%s: llama_decode() failed\n", __func__);
@@ -1037,7 +1037,7 @@ int main(int argc, char ** argv) {

 test t(inst, lmodel, ctx);

-llama_kv_cache_tokens_rm(ctx, -1, -1);
+llama_kv_cache_clear(ctx);

 // warmup run
 if (t.n_prompt > 0) {
@@ -1048,7 +1048,7 @@ int main(int argc, char ** argv) {
 }

 for (int i = 0; i < params.reps; i++) {
-llama_kv_cache_tokens_rm(ctx, -1, -1);
+llama_kv_cache_clear(ctx);

 uint64_t t_start = get_time_ns();
 if (t.n_prompt > 0) {
@@ -208,6 +208,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho

 Example usage: `--top-p 0.95`

+### Min P Sampling
+
+- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
+
+The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
+
+Example usage: `--min-p 0.05`
+
 ### Tail Free Sampling (TFS)

 - `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
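The Min-P rule added in the README hunk above amounts to a single comparison per candidate: a token survives only if its probability is at least p times the probability of the most likely token, after which the survivors are renormalized. The sketch below is a minimal, self-contained C++ illustration of that rule using the example from the text (p = 0.05, top probability 0.9, threshold 0.045); it is not the llama.cpp sampler itself, and the function name min_p_filter is invented for this example.

// Minimal sketch of the Min-P rule described above; illustrative only, not the actual llama.cpp sampler.
#include <algorithm>
#include <cstdio>
#include <vector>

// Keep candidates whose probability is at least p * (largest probability), then renormalize the survivors.
static std::vector<float> min_p_filter(const std::vector<float> & probs, float p) {
    const float max_prob  = *std::max_element(probs.begin(), probs.end());
    const float threshold = p * max_prob; // e.g. 0.05 * 0.9 = 0.045
    std::vector<float> kept;
    for (float prob : probs) {
        if (prob >= threshold) {
            kept.push_back(prob);
        }
    }
    float sum = 0.0f;
    for (float prob : kept) sum += prob;
    for (float & prob : kept) prob /= sum;
    return kept;
}

int main() {
    // Toy distribution: with p = 0.05 the 0.04 and 0.01 candidates fall below 0.045 and are dropped.
    const std::vector<float> kept = min_p_filter({0.90f, 0.05f, 0.04f, 0.01f}, 0.05f);
    for (float prob : kept) {
        printf("%.3f\n", prob);
    }
    return 0;
}

In the sampler change shown earlier (common/sampling.cpp), llama_sample_min_p is applied between llama_sample_top_p and llama_sample_temp, so Min-P composes with the other samplers rather than replacing them.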
@@ -298,7 +298,7 @@ int main(int argc, char ** argv) {
 }

 // remove any "future" tokens that we might have inherited from the previous session
-llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
+llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
 }

 LOGLN(
examples/perplexity/perplexity.cpp
@@ -210,7 +210,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 const auto t_start = std::chrono::high_resolution_clock::now();

 // clear the KV cache
-llama_kv_cache_tokens_rm(ctx, -1, -1);
+llama_kv_cache_clear(ctx);

 for (int j = 0; j < num_batches; ++j) {
 const int batch_start = start + j * n_batch;
@@ -339,7 +339,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 const auto t_start = std::chrono::high_resolution_clock::now();

 // clear the KV cache
-llama_kv_cache_tokens_rm(ctx, -1, -1);
+llama_kv_cache_clear(ctx);

 for (int j = 0; j < num_batches; ++j) {
 const int batch_start = start + j * n_batch;
@@ -573,7 +573,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 }

 // clear the KV cache
-llama_kv_cache_tokens_rm(ctx, -1, -1);
+llama_kv_cache_clear(ctx);

 auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
 if (logits.empty()) {
examples/quantize/quantize.cpp
@@ -18,7 +18,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
 { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
 { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
 { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
-#ifdef GGML_USE_K_QUANTS
 { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
 { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
 { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
@@ -31,7 +30,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
 { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
 { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
 { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
-#endif
 { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
 { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
 { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
@@ -70,13 +68,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 }

 // usage:
-// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 [[noreturn]]
 static void usage(const char * executable) {
-printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
 printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
 printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
 printf("\nAllowed quantization types:\n");
 for (auto & it : QUANT_OPTIONS) {
 if (it.name != "COPY") {
@@ -103,6 +102,8 @@ int main(int argc, char ** argv) {
 params.quantize_output_tensor = false;
 } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
 params.allow_requantize = true;
+} else if (strcmp(argv[arg_idx], "--pure") == 0) {
+params.pure = true;
 } else {
 usage(argv[0]);
 }
examples/server/server.cpp
@@ -857,7 +857,7 @@ struct llama_server_context

 void kv_cache_clear() {
 // clear the entire KV cache
-llama_kv_cache_tokens_rm(ctx, -1, -1);
+llama_kv_cache_clear(ctx);
 clean_kv_cache = false;
 }

@@ -1502,7 +1502,7 @@ struct llama_server_context
 {
 for (auto & slot : slots)
 {
-const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
+const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();

 // empty prompt passed -> release the slot and send empty response
 if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
@@ -95,13 +95,8 @@ int main(int argc, char ** argv) {
 llama_batch batch = llama_batch_init(512, 0, 1);

 // evaluate the initial prompt
-batch.n_tokens = tokens_list.size();
-
-for (int32_t i = 0; i < batch.n_tokens; i++) {
-    batch.token[i] = tokens_list[i];
-    batch.pos[i] = i;
-    batch.seq_id[i] = 0;
-    batch.logits[i] = false;
+for (size_t i = 0; i < tokens_list.size(); i++) {
+    llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
 }

 // llama_decode will output logits only for the last token of the prompt
@@ -148,15 +143,10 @@ int main(int argc, char ** argv) {
 fflush(stdout);

 // prepare the next batch
-batch.n_tokens = 0;
+llama_batch_clear(batch);

 // push this new token for next evaluation
-batch.token [batch.n_tokens] = new_token_id;
-batch.pos [batch.n_tokens] = n_cur;
-batch.seq_id[batch.n_tokens] = 0;
-batch.logits[batch.n_tokens] = true;
-
-batch.n_tokens += 1;
+llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);

 n_decode += 1;
 }
examples/speculative/speculative.cpp
@@ -8,6 +8,9 @@
 #include <string>
 #include <vector>

+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
+#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
+
 struct seq_draft {
 bool active = false;
 bool drafting = false;
@@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
 params.n_gpu_layers = params.n_gpu_layers_draft;
 std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

+{
+    const int n_vocab_tgt = llama_n_vocab(model_tgt);
+    const int n_vocab_dft = llama_n_vocab(model_dft);
+    const int vocab_diff = n_vocab_tgt > n_vocab_dft
+        ? n_vocab_tgt - n_vocab_dft
+        : n_vocab_dft - n_vocab_tgt;
+
+    if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
+        fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
+        fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+            n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+        return 1;
+    }
+
+    for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
+        const char * token_text_tgt = llama_token_get_text(model_tgt, i);
+        const char * token_text_dft = llama_token_get_text(model_dft, i);
+        if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
+            fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
+            fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
+                llama_token_to_piece(ctx_tgt, i).c_str(),
+                llama_token_to_piece(ctx_dft, i).c_str());
+            return 1;
+        }
+    }
+}
+
 // tokenize the prompt
 std::vector<llama_token> inp;
 inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
@@ -118,7 +148,7 @@ int main(int argc, char ** argv) {
 std::vector<seq_draft> drafts(n_seq_dft);

 params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
-params.sparams.temp = std::max(0.01f, params.sparams.temp);
+params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model

 for (int s = 0; s < n_seq_dft; ++s) {
 drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
@@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
 llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);

 llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
+// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
 llama_decode (ctx_dft, batch_dft);

 ++n_past_dft;
@@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
 llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
 }

-//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
+// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
 llama_decode(ctx_tgt, batch_tgt);
 ++n_past_tgt;
 }
flake.lock (12 changes, generated)
@@ -5,11 +5,11 @@
 "systems": "systems"
 },
 "locked": {
-"lastModified": 1692799911,
-"narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
+"lastModified": 1694529238,
+"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
 "owner": "numtide",
 "repo": "flake-utils",
-"rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
+"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
 "type": "github"
 },
 "original": {
@@ -20,11 +20,11 @@
 },
 "nixpkgs": {
 "locked": {
-"lastModified": 1692913444,
-"narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
+"lastModified": 1698318101,
+"narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
 "owner": "NixOS",
 "repo": "nixpkgs",
-"rev": "18324978d632ffc55ef1d928e81630c620f4f447",
+"rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
 "type": "github"
 },
 "original": {
flake.nix (17 changes)
@@ -11,8 +11,7 @@
 meta.mainProgram = "llama";
 inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
 buildInputs = with pkgs; [ openmpi ];
-osSpecific = with pkgs; buildInputs ++
-(
+osSpecific = with pkgs; buildInputs ++ (
 if isAarch64 && isDarwin then
 with pkgs.darwin.apple_sdk_11_0.frameworks; [
 Accelerate
@@ -51,6 +50,9 @@
 };
 llama-python =
 pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
+# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
+llama-python-extra =
+pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
 postPatch = ''
 substituteInPlace ./ggml-metal.m \
 --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
@@ -93,12 +95,15 @@
 };
 packages.rocm = pkgs.stdenv.mkDerivation {
 inherit name src meta postPatch nativeBuildInputs postInstall;
-buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
+buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
 cmakeFlags = cmakeFlags ++ [
 "-DLLAMA_HIPBLAS=1"
 "-DCMAKE_C_COMPILER=hipcc"
 "-DCMAKE_CXX_COMPILER=hipcc"
-"-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
+# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
+# in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
+# and select the line that matches the current nixpkgs version of rocBLAS.
+"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
 ];
 };
 apps.llama-server = {
@@ -126,5 +131,9 @@
 buildInputs = [ llama-python ];
 packages = nativeBuildInputs ++ osSpecific;
 };
+devShells.extra = pkgs.mkShell {
+buildInputs = [ llama-python-extra ];
+packages = nativeBuildInputs ++ osSpecific;
+};
 });
 }
ggml-cuda.cu (130 changes)
@@ -87,6 +87,24 @@
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)

+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// - 7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+// probably other such cases, and not sure what happens on AMD hardware
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

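The comment block and defines introduced above describe a policy rather than a single code path: MMQ kernels multiply directly on quantized blocks, while the cuBLAS route dequantizes to F16 and relies on tensor cores, at the VRAM cost quoted in the comment. GGML_CUDA_FORCE_MMQ (also exposed as the LLAMA_CUDA_FORCE_MMQ build option added in CMakeLists.txt and the Makefile earlier in this diff) pins the choice to MMQ, and MMQ_MAX_BATCH_SIZE caps how large a batch still prefers MMQ when tensor cores are available. The sketch below is only a rough, hypothetical illustration of how such a decision could fit together; the real dispatch in ggml-cuda.cu also depends on compute capability and tensor shapes, and the function use_mmq here is invented for this note, not part of the ggml API.

// Hypothetical illustration of the tradeoff described in the comments above; not the actual ggml-cuda.cu dispatch logic.
#include <cstdio>

static bool use_mmq(bool force_mmq, bool has_tensor_cores, int batch_size, int mmq_max_batch_size) {
    if (force_mmq)         return true;      // GGML_CUDA_FORCE_MMQ: always take the quantized MMQ path
    if (!has_tensor_cores) return true;      // no tensor cores: the cuBLAS F16 path loses its advantage
    return batch_size <= mmq_max_batch_size; // small batches: MMQ avoids extra VRAM for dequantized copies
}

int main() {
    printf("large batch, tensor cores -> MMQ? %d\n", use_mmq(false, true, 512, 32)); // 0 (cuBLAS)
    printf("small batch, tensor cores -> MMQ? %d\n", use_mmq(false, true,   8, 32)); // 1 (MMQ)
    printf("forced MMQ                -> MMQ? %d\n", use_mmq(true,  true, 512, 32)); // 1 (MMQ)
    return 0;
}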
@ -470,7 +488,6 @@ static int g_device_count = -1;
|
||||||
static int g_main_device = 0;
|
static int g_main_device = 0;
|
||||||
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
||||||
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
static bool g_mul_mat_q = true;
|
|
||||||
|
|
||||||
static void * g_scratch_buffer = nullptr;
|
static void * g_scratch_buffer = nullptr;
|
||||||
static size_t g_scratch_size = 0; // disabled by default
|
static size_t g_scratch_size = 0; // disabled by default
|
||||||
|
@ -3563,9 +3580,15 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||||
#define MMQ_X_Q4_0_RDNA1 64
|
#define MMQ_X_Q4_0_RDNA1 64
|
||||||
#define MMQ_Y_Q4_0_RDNA1 64
|
#define MMQ_Y_Q4_0_RDNA1 64
|
||||||
#define NWARPS_Q4_0_RDNA1 8
|
#define NWARPS_Q4_0_RDNA1 8
|
||||||
|
#if defined(CUDA_USE_TENSOR_CORES)
|
||||||
|
#define MMQ_X_Q4_0_AMPERE 4
|
||||||
|
#define MMQ_Y_Q4_0_AMPERE 32
|
||||||
|
#define NWARPS_Q4_0_AMPERE 4
|
||||||
|
#else
|
||||||
#define MMQ_X_Q4_0_AMPERE 64
|
#define MMQ_X_Q4_0_AMPERE 64
|
||||||
#define MMQ_Y_Q4_0_AMPERE 128
|
#define MMQ_Y_Q4_0_AMPERE 128
|
||||||
#define NWARPS_Q4_0_AMPERE 4
|
#define NWARPS_Q4_0_AMPERE 4
|
||||||
|
#endif
|
||||||
#define MMQ_X_Q4_0_PASCAL 64
|
#define MMQ_X_Q4_0_PASCAL 64
|
||||||
#define MMQ_Y_Q4_0_PASCAL 64
|
#define MMQ_Y_Q4_0_PASCAL 64
|
||||||
#define NWARPS_Q4_0_PASCAL 8
|
#define NWARPS_Q4_0_PASCAL 8
|
||||||
|
@ -3624,9 +3647,15 @@ template <bool need_check> static __global__ void
|
||||||
#define MMQ_X_Q4_1_RDNA1 64
|
#define MMQ_X_Q4_1_RDNA1 64
|
||||||
#define MMQ_Y_Q4_1_RDNA1 64
|
#define MMQ_Y_Q4_1_RDNA1 64
|
||||||
#define NWARPS_Q4_1_RDNA1 8
|
#define NWARPS_Q4_1_RDNA1 8
|
||||||
|
#if defined(CUDA_USE_TENSOR_CORES)
|
||||||
|
#define MMQ_X_Q4_1_AMPERE 4
|
||||||
|
#define MMQ_Y_Q4_1_AMPERE 32
|
||||||
|
#define NWARPS_Q4_1_AMPERE 4
|
||||||
|
#else
|
||||||
#define MMQ_X_Q4_1_AMPERE 64
|
#define MMQ_X_Q4_1_AMPERE 64
|
||||||
#define MMQ_Y_Q4_1_AMPERE 128
|
#define MMQ_Y_Q4_1_AMPERE 128
|
||||||
#define NWARPS_Q4_1_AMPERE 4
|
#define NWARPS_Q4_1_AMPERE 4
|
||||||
|
#endif
|
||||||
#define MMQ_X_Q4_1_PASCAL 64
|
#define MMQ_X_Q4_1_PASCAL 64
|
||||||
#define MMQ_Y_Q4_1_PASCAL 64
|
#define MMQ_Y_Q4_1_PASCAL 64
|
||||||
#define NWARPS_Q4_1_PASCAL 8
|
#define NWARPS_Q4_1_PASCAL 8
|
||||||
|
@ -3687,9 +3716,15 @@ template <bool need_check> static __global__ void
|
||||||
#define MMQ_X_Q5_0_RDNA1 64
|
#define MMQ_X_Q5_0_RDNA1 64
|
||||||
#define MMQ_Y_Q5_0_RDNA1 64
|
#define MMQ_Y_Q5_0_RDNA1 64
|
||||||
#define NWARPS_Q5_0_RDNA1 8
|
#define NWARPS_Q5_0_RDNA1 8
|
||||||
|
#if defined(CUDA_USE_TENSOR_CORES)
|
||||||
|
#define MMQ_X_Q5_0_AMPERE 4
|
||||||
|
#define MMQ_Y_Q5_0_AMPERE 32
|
||||||
|
#define NWARPS_Q5_0_AMPERE 4
|
||||||
|
#else
|
||||||
#define MMQ_X_Q5_0_AMPERE 128
|
#define MMQ_X_Q5_0_AMPERE 128
|
||||||
#define MMQ_Y_Q5_0_AMPERE 64
|
#define MMQ_Y_Q5_0_AMPERE 64
|
||||||
#define NWARPS_Q5_0_AMPERE 4
|
#define NWARPS_Q5_0_AMPERE 4
|
||||||
|
#endif
|
||||||
#define MMQ_X_Q5_0_PASCAL 64
|
#define MMQ_X_Q5_0_PASCAL 64
|
||||||
#define MMQ_Y_Q5_0_PASCAL 64
|
#define MMQ_Y_Q5_0_PASCAL 64
|
||||||
#define NWARPS_Q5_0_PASCAL 8
|
#define NWARPS_Q5_0_PASCAL 8
|
||||||
|
@ -3748,9 +3783,15 @@ template <bool need_check> static __global__ void
|
||||||
#define MMQ_X_Q5_1_RDNA1 64
|
#define MMQ_X_Q5_1_RDNA1 64
|
||||||
#define MMQ_Y_Q5_1_RDNA1 64
|
#define MMQ_Y_Q5_1_RDNA1 64
|
||||||
#define NWARPS_Q5_1_RDNA1 8
|
#define NWARPS_Q5_1_RDNA1 8
|
||||||
|
#if defined(CUDA_USE_TENSOR_CORES)
|
||||||
|
#define MMQ_X_Q5_1_AMPERE 4
|
||||||
|
#define MMQ_Y_Q5_1_AMPERE 32
|
||||||
|
#define NWARPS_Q5_1_AMPERE 4
|
||||||
|
#else
|
||||||
#define MMQ_X_Q5_1_AMPERE 128
|
#define MMQ_X_Q5_1_AMPERE 128
|
||||||
#define MMQ_Y_Q5_1_AMPERE 64
|
#define MMQ_Y_Q5_1_AMPERE 64
|
||||||
#define NWARPS_Q5_1_AMPERE 4
|
#define NWARPS_Q5_1_AMPERE 4
|
||||||
|
#endif
|
||||||
#define MMQ_X_Q5_1_PASCAL 64
|
#define MMQ_X_Q5_1_PASCAL 64
|
||||||
#define MMQ_Y_Q5_1_PASCAL 64
|
#define MMQ_Y_Q5_1_PASCAL 64
|
||||||
#define NWARPS_Q5_1_PASCAL 8
|
#define NWARPS_Q5_1_PASCAL 8
|
||||||
|
@ -3809,9 +3850,15 @@ mul_mat_q5_1(
|
||||||
#define MMQ_X_Q8_0_RDNA1 64
|
#define MMQ_X_Q8_0_RDNA1 64
|
||||||
#define MMQ_Y_Q8_0_RDNA1 64
|
#define MMQ_Y_Q8_0_RDNA1 64
|
||||||
#define NWARPS_Q8_0_RDNA1 8
|
#define NWARPS_Q8_0_RDNA1 8
|
||||||
|
#if defined(CUDA_USE_TENSOR_CORES)
|
||||||
|
#define MMQ_X_Q8_0_AMPERE 4
|
||||||
|
#define MMQ_Y_Q8_0_AMPERE 32
|
||||||
|
#define NWARPS_Q8_0_AMPERE 4
|
||||||
|
#else
|
||||||
#define MMQ_X_Q8_0_AMPERE 128
|
#define MMQ_X_Q8_0_AMPERE 128
|
||||||
#define MMQ_Y_Q8_0_AMPERE 64
|
#define MMQ_Y_Q8_0_AMPERE 64
|
||||||
#define NWARPS_Q8_0_AMPERE 4
|
#define NWARPS_Q8_0_AMPERE 4
|
||||||
|
#endif
|
||||||
#define MMQ_X_Q8_0_PASCAL 64
|
#define MMQ_X_Q8_0_PASCAL 64
|
||||||
#define MMQ_Y_Q8_0_PASCAL 64
|
#define MMQ_Y_Q8_0_PASCAL 64
|
||||||
 #define NWARPS_Q8_0_PASCAL 8
@@ -3870,9 +3917,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q2_K_RDNA1 128
 #define MMQ_Y_Q2_K_RDNA1 32
 #define NWARPS_Q2_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q2_K_AMPERE 4
+#define MMQ_Y_Q2_K_AMPERE 32
+#define NWARPS_Q2_K_AMPERE 4
+#else
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
+#endif
 #define MMQ_X_Q2_K_PASCAL 64
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8
@@ -3931,9 +3984,15 @@ mul_mat_q2_K(
 #define MMQ_X_Q3_K_RDNA1 32
 #define MMQ_Y_Q3_K_RDNA1 128
 #define NWARPS_Q3_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q3_K_AMPERE 4
+#define MMQ_Y_Q3_K_AMPERE 32
+#define NWARPS_Q3_K_AMPERE 4
+#else
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
+#endif
 #define MMQ_X_Q3_K_PASCAL 64
 #define MMQ_Y_Q3_K_PASCAL 64
 #define NWARPS_Q3_K_PASCAL 8
@@ -3994,9 +4053,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q4_K_RDNA1 32
 #define MMQ_Y_Q4_K_RDNA1 64
 #define NWARPS_Q4_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q4_K_AMPERE 4
+#define MMQ_Y_Q4_K_AMPERE 32
+#define NWARPS_Q4_K_AMPERE 4
+#else
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
+#endif
 #define MMQ_X_Q4_K_PASCAL 64
 #define MMQ_Y_Q4_K_PASCAL 64
 #define NWARPS_Q4_K_PASCAL 8
@@ -4057,9 +4122,15 @@ template <bool need_check> static __global__ void
 #define MMQ_X_Q5_K_RDNA1 32
 #define MMQ_Y_Q5_K_RDNA1 64
 #define NWARPS_Q5_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q5_K_AMPERE 4
+#define MMQ_Y_Q5_K_AMPERE 32
+#define NWARPS_Q5_K_AMPERE 4
+#else
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
+#endif
 #define MMQ_X_Q5_K_PASCAL 64
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8
@@ -4118,9 +4189,15 @@ mul_mat_q5_K(
 #define MMQ_X_Q6_K_RDNA1 32
 #define MMQ_Y_Q6_K_RDNA1 64
 #define NWARPS_Q6_K_RDNA1 8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define MMQ_X_Q6_K_AMPERE 4
+#define MMQ_Y_Q6_K_AMPERE 32
+#define NWARPS_Q6_K_AMPERE 4
+#else
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
+#endif
 #define MMQ_X_Q6_K_PASCAL 64
 #define MMQ_Y_Q6_K_PASCAL 64
 #define NWARPS_Q6_K_PASCAL 8
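Note on the tile constants above: MMQ_X/MMQ_Y set the output tile computed by one thread block and NWARPS sets how many warps cooperate on it; with CUDA_USE_TENSOR_CORES defined, the Ampere tiles shrink to 4x32, since the MMQ path is then only taken for small batches (see the dispatch change to ggml_cuda_mul_mat further down). A minimal sketch of how such per-architecture constants would typically be selected at launch time; the helper function and the compute-capability threshold here are illustrative assumptions, not code from this diff:

    // Illustrative only: choose Q2_K tile sizes by target architecture.
    // The macro names come from the defines above; pick_q2_K_tile() itself is hypothetical.
    static void pick_q2_K_tile(int compute_capability, int * mmq_x, int * mmq_y, int * nwarps) {
        if (compute_capability >= 800) { // Ampere and newer
            *mmq_x  = MMQ_X_Q2_K_AMPERE;
            *mmq_y  = MMQ_Y_Q2_K_AMPERE;
            *nwarps = NWARPS_Q2_K_AMPERE;
        } else {                         // Pascal fallback
            *mmq_x  = MMQ_X_Q2_K_PASCAL;
            *mmq_y  = MMQ_Y_Q2_K_PASCAL;
            *nwarps = NWARPS_Q2_K_PASCAL;
        }
    }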
@@ -5677,6 +5754,16 @@ void ggml_init_cublas() {
 CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
 GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
 int64_t total_vram = 0;
+#if defined(GGML_CUDA_FORCE_MMQ)
+fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+#else
+fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+#endif
+#if defined(CUDA_USE_TENSOR_CORES)
+fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#else
+fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+#endif
 fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
 for (int id = 0; id < g_device_count; ++id) {
 cudaDeviceProp prop;
@@ -6364,7 +6451,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
 cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
 row_diff, src1_ncols, ne10,
 &alpha, src0_ddf_i, ne00,
 src1_ddf_i, ne10,
 &beta, dst_dd_i, ldc));

 if (src0_as != 0) {
@@ -7065,9 +7152,10 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }

-static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 GGML_ASSERT(!ggml_is_transposed(src0));
 GGML_ASSERT(!ggml_is_transposed(src1));

 GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
 GGML_ASSERT(src0->type == GGML_TYPE_F16);
 GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -7219,17 +7307,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 }

 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
-    src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
+const bool all_on_device =
+    (src0->backend == GGML_BACKEND_GPU) &&
+    (src1->backend == GGML_BACKEND_GPU) &&
+    ( dst->backend == GGML_BACKEND_GPU);

 int64_t min_compute_capability = INT_MAX;
 for (int64_t id = 0; id < g_device_count; ++id) {
-if (min_compute_capability > g_compute_capabilities[id]
-    && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
 min_compute_capability = g_compute_capabilities[id];
 }
 }

+#ifdef CUDA_USE_TENSOR_CORES
+const bool use_tensor_cores = true;
+#else
+const bool use_tensor_cores = false;
+#endif

 // debug helpers
 //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
 //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -7238,20 +7333,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
 //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
 // KQ single-batch
 ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-} else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+} else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
 // KQV single-batch
 ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
 // KQ + KQV multi-batch
 ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
 } else if (src0->type == GGML_TYPE_F32) {
 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
 } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
 if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {

 #ifdef GGML_CUDA_FORCE_DMMV
 const bool use_mul_mat_vec_q = false;
 #else
@@ -7264,7 +7358,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
 }
 } else {
-if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
+bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+// when tensor cores are available, use them for large batch size
+// ref: https://github.com/ggerganov/llama.cpp/pull/3776
+if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+use_mul_mat_q = false;
+}
+
+if (use_mul_mat_q) {
 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
 } else {
 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
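The dispatch change above boils down to one rule: quantized matrices still go through the integer MMQ kernels on DP4A-capable GPUs, but when the build enables tensor cores and the batch is larger than MMQ_MAX_BATCH_SIZE, cuBLAS is used instead. A condensed restatement of that decision, assuming the same constants used in ggml-cuda.cu; this helper does not exist in the source and is only a sketch:

    // Sketch of the mul_mat kernel-selection heuristic introduced above.
    static bool should_use_mul_mat_q(bool use_tensor_cores, int64_t min_compute_capability,
                                     enum ggml_type src0_type, int64_t batch_size) {
        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0_type);
        // with tensor cores available, large batches are faster through cuBLAS (ref: PR #3776)
        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && batch_size > MMQ_MAX_BATCH_SIZE) {
            use_mul_mat_q = false;
        }
        return use_mul_mat_q;
    }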
@@ -7618,10 +7720,6 @@ void ggml_cuda_set_main_device(const int main_device) {
 }
 }

-void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
-g_mul_mat_q = mul_mat_q;
-}
-
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
 // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
 // it still won't always work as expected, but it's better than nothing
237  ggml-impl.h  (new file)
@@ -0,0 +1,237 @@
+#pragma once
+
+#include "ggml.h"
+
+// GGML internal header
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h> // memcpy
+#include <math.h>   // fabsf
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
+// if C99 - static_assert is noop
+// ref: https://stackoverflow.com/a/53923785/4039976
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#endif
+
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+
+#define GGML_FP16_TO_FP32(x) ((float) (x))
+#define GGML_FP32_TO_FP16(x) (x)
+
+#else
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#ifdef __F16C__
+
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    register float f;
+    register double d;
+    __asm__(
+        "mtfprd %0,%2\n"
+        "xscvhpdp %0,%0\n"
+        "frsp %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=f"(f):
+        /* in */   "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    register double d;
+    register ggml_fp16_t r;
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n"
+        "mffprd %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=r"(r):
+        /* in */   "f"(f));
+    return r;
+}
+
+#else
+
+// FP16 <-> FP32
+// ref: https://github.com/Maratyszcza/FP16
+
+static inline float fp32_from_bits(uint32_t w) {
+    union {
+        uint32_t as_bits;
+        float as_value;
+    } fp32;
+    fp32.as_bits = w;
+    return fp32.as_value;
+}
+
+static inline uint32_t fp32_to_bits(float f) {
+    union {
+        float as_value;
+        uint32_t as_bits;
+    } fp32;
+    fp32.as_value = f;
+    return fp32.as_bits;
+}
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    const uint32_t w = (uint32_t) h << 16;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    const uint32_t two_w = w + w;
+
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float exp_scale = 0x1.0p-112f;
+#else
+    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+#endif
+    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+    const uint32_t magic_mask = UINT32_C(126) << 23;
+    const float magic_bias = 0.5f;
+    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+    const uint32_t result = sign |
+        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+    return fp32_from_bits(result);
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float scale_to_inf = 0x1.0p+112f;
+    const float scale_to_zero = 0x1.0p-110f;
+#else
+    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+#endif
+    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+    const uint32_t w = fp32_to_bits(f);
+    const uint32_t shl1_w = w + w;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+    if (bias < UINT32_C(0x71000000)) {
+        bias = UINT32_C(0x71000000);
+    }
+
+    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+    const uint32_t bits = fp32_to_bits(base);
+    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+    const uint32_t nonsign = exp_bits + mantissa_bits;
+    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+}
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // __ARM_NEON
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml.c, initialized in ggml_init()
+extern float ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return ggml_table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+// TODO: backend v2 PR
+
+#ifdef __cplusplus
+}
+#endif
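Since ggml-impl.h now owns the FP16 <-> FP32 helpers, a minimal round-trip through the conversion macros looks like the sketch below. This is a usage illustration only; note that GGML_FP16_TO_FP32() may be backed by the lookup table that ggml_init() fills, while the GGML_COMPUTE_* variants convert directly:

    #include "ggml-impl.h"
    #include <stdio.h>

    int main(void) {
        const float x = 3.14159f;
        const ggml_fp16_t h = GGML_COMPUTE_FP32_TO_FP16(x); // narrow to half precision
        const float       y = GGML_COMPUTE_FP16_TO_FP32(h); // widen back to float
        // half precision keeps roughly 3 decimal digits, so x and y differ slightly
        printf("fp32 %f -> fp16 -> fp32 %f\n", x, y);
        return 0;
    }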
@@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

 NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+if (sourcePath == nil) {
+GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
+sourcePath = @"ggml-metal.metal";
+}
 GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
 NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
 if (error) {

File diff suppressed because it is too large
@@ -1,11 +1,63 @@
 #pragma once

-#include "ggml.h"
+#include "ggml-impl.h"
+
+// GGML internal header

 #include <stdint.h>
-#include <assert.h>
 #include <stddef.h>
+
+#define QK4_0 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    ggml_fp16_t m;          // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+#define QK5_0 32
+typedef struct {
+    ggml_fp16_t d;         // delta
+    uint8_t qh[4];         // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2]; // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+#define QK5_1 32
+typedef struct {
+    ggml_fp16_t d;         // delta
+    ggml_fp16_t m;         // min
+    uint8_t qh[4];         // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2]; // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+#define QK8_0 32
+typedef struct {
+    ggml_fp16_t d;         // delta
+    int8_t  qs[QK8_0];     // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
+
+#define QK8_1 32
+typedef struct {
+    float d;               // delta
+    float s;               // d * sum(qs[i])
+    int8_t  qs[QK8_1];     // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
+
+//
+// Super-block quantization structures
+//

 // Super-block size
 #ifdef GGML_QKK_64
 #define QK_K 64
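For scale, the q4_0 layout declared above packs 32 weights into 18 bytes: a 2-byte fp16 scale d plus 16 bytes holding two 4-bit quants each, i.e. 4.5 bits per weight, with each nibble decoded as (q - 8) * d. A self-contained size check, using a standalone copy of the struct (uint16_t stands in for ggml_fp16_t so the sketch builds without the ggml headers):

    #include <stdint.h>
    #include <stdio.h>

    #define QK4_0 32
    typedef struct {
        uint16_t d;              // fp16 scale (delta), stored as raw bits here
        uint8_t  qs[QK4_0 / 2];  // two 4-bit quants per byte
    } block_q4_0_sketch;

    int main(void) {
        printf("block_q4_0: %zu bytes for %d weights = %.2f bits/weight\n",
               sizeof(block_q4_0_sketch), QK4_0, 8.0 * sizeof(block_q4_0_sketch) / QK4_0);
        return 0;
    }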
@@ -15,18 +67,6 @@
 #define K_SCALE_SIZE 12
 #endif

-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-
-//
-// Super-block quantization structures
-//
-
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
@@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_


 // Quantization
+void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
+void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
+void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
+void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
+void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
+void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
+
 void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
 void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
 void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
@@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);

+void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
+
 void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
@@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);

 // Dequantization
+void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
+void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
+void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
+
 void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
 void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
 void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
@@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
 void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);

 // Dot product
+void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
 void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);

-// Quantization with histogram collection
-size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
7  ggml.h
@@ -1930,12 +1930,19 @@ extern "C" {
 // quantization
 //

+// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
 GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
 GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
 GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
 GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);

+GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+
 GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

 //
326  llama.cpp
@@ -19,13 +19,11 @@
 #ifdef GGML_USE_MPI
 #  include "ggml-mpi.h"
 #endif
-#ifdef GGML_USE_K_QUANTS
-#  ifndef QK_K
-#    ifdef GGML_QKK_64
-#      define QK_K 64
-#    else
-#      define QK_K 256
-#    endif
-#  endif
-#endif
+#ifndef QK_K
+#  ifdef GGML_QKK_64
+#    define QK_K 64
+#  else
+#    define QK_K 256
+#  endif
+#endif

@@ -1468,17 +1466,12 @@ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
 return 0;
 }

-static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
-if (c0 < 0) c0 = 0;
-if (c1 < 0) c1 = cache.size;
-
-for (int32_t i = c0; i < c1; ++i) {
+static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
+for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
 cache.cells[i].pos = -1;
 cache.cells[i].seq_id.clear();
 }
-
-// Searching for a free slot can start here since we know it will be empty.
-cache.head = uint32_t(c0);
+cache.head = 0;
 }

 static void llama_kv_cache_seq_rm(
@@ -1492,8 +1485,14 @@ static void llama_kv_cache_seq_rm(
 if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

 for (uint32_t i = 0; i < cache.size; ++i) {
-if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-cache.cells[i].seq_id.erase(seq_id);
+if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+if (seq_id < 0) {
+cache.cells[i].seq_id.clear();
+} else if (cache.cells[i].has_seq_id(seq_id)) {
+cache.cells[i].seq_id.erase(seq_id);
+} else {
+continue;
+}
 if (cache.cells[i].seq_id.empty()) {
 cache.cells[i].pos = -1;
 if (new_head == cache.size) new_head = i;
@@ -1554,14 +1553,14 @@ static void llama_kv_cache_seq_shift(

 for (uint32_t i = 0; i < cache.size; ++i) {
 if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-cache.cells[i].pos += delta;
+cache.has_shift = true;
+cache.cells[i].pos   += delta;
+cache.cells[i].delta += delta;

 if (cache.cells[i].pos < 0) {
 cache.cells[i].pos = -1;
 cache.cells[i].seq_id.clear();
 if (new_head == cache.size) new_head = i;
-} else {
-cache.has_shift = true;
-cache.cells[i].delta = delta;
 }
 }
 }
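The llama_kv_cache_seq_shift change above switches the per-cell delta from being overwritten to being accumulated, and sets has_shift as soon as any cell moves. An illustrative reduction of that difference (the struct and function here are stand-ins, not llama.cpp code):

    // Before: c->delta = delta;  After: repeated shifts add up until the shift is applied.
    struct kv_cell_sketch { int pos; int delta; };

    static void shift_cell(struct kv_cell_sketch * c, int delta) {
        c->pos   += delta;
        c->delta += delta;
    }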
@@ -1578,12 +1577,14 @@ static void llama_kv_cache_seq_shift(
 enum llama_fver {
 GGUF_FILE_VERSION_V1 = 1,
 GGUF_FILE_VERSION_V2 = 2,
+GGUF_FILE_VERSION_V3 = 3,
 };

 static const char * llama_file_version_name(llama_fver version) {
 switch (version) {
 case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
-case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
+case GGUF_FILE_VERSION_V2: return "GGUF V2";
+case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
 }

 return "unknown";
@@ -2693,8 +2694,8 @@ static void llm_load_tensors(
 } break;
 case LLM_ARCH_STARCODER:
 {
 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);

 // output
 {
@@ -2745,19 +2746,19 @@ static void llm_load_tensors(
 layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);

 layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);

 layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
-layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);

 layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);

 layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
-layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);

 layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);

 if (backend == GGML_BACKEND_GPU) {
 vram_weights +=
@@ -4614,6 +4615,8 @@ static struct ggml_cgraph * llm_build_starcoder(

 const float norm_eps = hparams.f_norm_eps;

+const int n_gpu_layers = model.n_gpu_layers;
+
 const int32_t n_tokens = batch.n_tokens;
 const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
 const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
@@ -4658,6 +4661,27 @@ static struct ggml_cgraph * llm_build_starcoder(
 }
 }

+const int i_gpu_start = n_layer - n_gpu_layers;
+(void) i_gpu_start;
+
+// offload functions set the tensor output backend to GPU
+// tensors are GPU-accelerated if any input or the output has been offloaded
+offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+offload_func_t offload_func_kq = llama_nop;
+offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+if (n_gpu_layers > n_layer) {
+offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+}
+if (n_gpu_layers > n_layer + 1) {
+offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+}
+if (n_gpu_layers > n_layer + 2) {
+offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+}
+#endif // GGML_USE_CUBLAS
+
 {
 // Compute position embeddings.
 struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@@ -4683,6 +4707,7 @@ static struct ggml_cgraph * llm_build_starcoder(
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
 ggml_set_name(KQ_mask, "KQ_mask");
+offload_func_kq(KQ_mask);
 ggml_allocr_alloc(lctx.alloc, KQ_mask);
 if (!ggml_allocr_is_measure(lctx.alloc)) {
 float * data = (float *) KQ_mask->data;
@@ -4706,44 +4731,67 @@ static struct ggml_cgraph * llm_build_starcoder(
 ggml_set_name(inpL, "inpL");

 for (int il = 0; il < n_layer; ++il) {
+offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+if (il >= i_gpu_start) {
+offload_func = ggml_cuda_assign_buffers_no_alloc;
+}
+#endif // GGML_USE_CUBLAS
+
 {
 // Norm
 cur = ggml_norm(ctx0, inpL, norm_eps);
+offload_func(cur);

 cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+offload_func(cur);
 }

 {
 // Self Attention
-cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
-
-struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
-struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
-struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
-
-struct ggml_tensor * Qcur = tmpq;
+cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+offload_func_kq(cur);
+
+cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+offload_func_kq(cur);
+
+struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ggml_set_name(tmpq, "tmpq");
+ggml_set_name(tmpk, "tmpk");
+ggml_set_name(tmpv, "tmpv");
+
+offload_func_kq(tmpq);
+offload_func_kq(tmpk);
+offload_func_v (tmpv);
+
+struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
 struct ggml_tensor * Kcur = tmpk;

 {
-struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv);
+offload_func_v(Vcur);
 ggml_set_name(Vcur, "Vcur");

 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+offload_func_kq(k);
 ggml_set_name(k, "k");

 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
 (   n_ctx)*ggml_element_size(kv_self.v),
 (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+offload_func_v(v);
+ggml_set_name(v, "v");

 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
 }

-struct ggml_tensor * Q =
-ggml_permute(ctx0,
-ggml_cpy(ctx0,
-Qcur,
-ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
-0, 2, 1, 3);
+struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+offload_func_kq(Q);
 ggml_set_name(Q, "Q");

 struct ggml_tensor * K =
@@ -4752,23 +4800,28 @@ static struct ggml_cgraph * llm_build_starcoder(
 ggml_element_size(kv_self.k)*n_embd_gqa,
 ggml_element_size(kv_self.k)*n_embd_head,
 ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+offload_func_kq(K);
 ggml_set_name(K, "K");

 // K * Q
 struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+offload_func_kq(KQ);
 ggml_set_name(KQ, "KQ");

 // KQ_scaled = KQ / sqrt(n_embd_head)
 // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
 struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+offload_func_kq(KQ_scaled);
 ggml_set_name(KQ_scaled, "KQ_scaled");

 // KQ_masked = mask_past(KQ_scaled)
 struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+offload_func_kq(KQ_masked);
 ggml_set_name(KQ_masked, "KQ_masked");

 // KQ = soft_max(KQ_masked)
 struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+offload_func_v(KQ_soft_max);
 ggml_set_name(KQ_soft_max, "KQ_soft_max");

 // split cached V into n_head heads
@@ -4781,22 +4834,25 @@ static struct ggml_cgraph * llm_build_starcoder(
 ggml_set_name(V, "V");

 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+offload_func_v(KQV);
 ggml_set_name(KQV, "KQV");

-// KQV_merged = KQV.permute(0, 2, 1, 3)
 struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+offload_func_v(KQV_merged);
 ggml_set_name(KQV_merged, "KQV_merged");

-// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
 cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+offload_func_v(cur);
 ggml_set_name(cur, "KQV_merged_contiguous");
 }

 // Projection
 cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+offload_func(cur);

 // Add the input
 cur = ggml_add(ctx0, cur, inpL);
+offload_func(cur);

 struct ggml_tensor * inpFF = cur;

@@ -4805,27 +4861,36 @@ static struct ggml_cgraph * llm_build_starcoder(
 // Norm
 {
 cur = ggml_norm(ctx0, inpFF, norm_eps);
+offload_func_nr(cur);

 cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+offload_func_nr(cur);
 }

 cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+offload_func(cur);

 // GELU activation
 cur = ggml_gelu(ctx0, cur);
+offload_func(cur);

 // Projection
 cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+offload_func(cur);
 }

 inpL = ggml_add(ctx0, cur, inpFF);

 }

 // Output Norm
 {
 cur = ggml_norm(ctx0, inpL, norm_eps);
+offload_func_nr(cur);

 cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+ggml_set_name(cur, "result_norm");
 }
-ggml_set_name(cur, "result_norm");

 cur = ggml_mul_mat(ctx0, model.output, cur);
 ggml_set_name(cur, "result_output");
@@ -5959,8 +6024,6 @@ static int llama_decode_internal(
 }
 }

-ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
-
 // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
 if (!lctx.embedding.empty()) {
 embeddings->backend = GGML_BACKEND_CPU;
@@ -6011,11 +6074,20 @@ static int llama_decode_internal(
 #endif

 // update the kv ring buffer
-lctx.kv_self.has_shift = false;
-lctx.kv_self.head += n_tokens;
-// Ensure kv cache head points to a valid index.
-if (lctx.kv_self.head >= lctx.kv_self.size) {
-lctx.kv_self.head = 0;
+{
+if (kv_self.has_shift) {
+kv_self.has_shift = false;
+for (uint32_t i = 0; i < kv_self.size; ++i) {
+kv_self.cells[i].delta = 0;
+}
+}
+
+kv_self.head += n_tokens;
+
+// Ensure kv cache head points to a valid index.
+if (kv_self.head >= kv_self.size) {
+kv_self.head = 0;
+}
 }

 #ifdef GGML_PERF
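After a shift has been consumed, llama_decode_internal now zeroes every cell's delta and then advances the ring-buffer head with wraparound. The head update alone, as a small standalone sketch (the function name is illustrative):

    // Advance the KV ring-buffer head and wrap so it always points at a valid cell.
    static unsigned advance_kv_head(unsigned head, unsigned n_tokens, unsigned size) {
        head += n_tokens;
        if (head >= size) {
            head = 0;
        }
        return head;
    }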
@ -7296,6 +7368,32 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
|
||||||
|
if (p <= 0.0f || !candidates->size) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_sample_softmax(ctx, candidates);
|
||||||
|
|
||||||
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
|
float scale = candidates->data[0].p; // scale by max prob
|
||||||
|
size_t i = 1; // first token always matches
|
||||||
|
|
||||||
|
for (; i < candidates->size; ++i) {
|
||||||
|
if (candidates->data[i].p < p * scale && i >= min_keep) {
|
||||||
|
break; // prob too small
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resize the output vector to keep only the matching tokens
|
||||||
|
candidates->size = i;
|
||||||
|
|
||||||
|
if (ctx) {
|
||||||
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
|
void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
|
||||||
if (z >= 1.0f || candidates->size <= 2) {
|
if (z >= 1.0f || candidates->size <= 2) {
|
||||||
return;
|
return;
|
||||||
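Stated on its own, the rule above keeps token i only while p_i >= p * p_max after softmax, always retaining at least min_keep tokens. A minimal self-contained sketch of the same cutoff, with a hypothetical helper name and a plain std::vector instead of the llama.cpp types:

    #include <cstddef>
    #include <vector>

    // probs must be sorted in descending order (as llama_sample_softmax guarantees).
    // Returns how many leading entries survive min-p filtering.
    static size_t min_p_keep(const std::vector<float> & probs, float p, size_t min_keep) {
        if (probs.empty() || p <= 0.0f) {
            return probs.size();
        }
        const float cutoff = p * probs[0]; // threshold is scaled by the top probability
        size_t i = 1;                      // the most likely token always survives
        for (; i < probs.size(); ++i) {
            if (probs[i] < cutoff && i >= min_keep) {
                break;
            }
        }
        return i;
    }

With p = 0.05, for example, any token whose probability is below 5% of the most likely token's probability is dropped.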
@@ -7985,6 +8083,24 @@ struct no_init {
     no_init() { /* do nothing */ }
 };
 
+struct quantize_state_internal {
+    const llama_model                 & model;
+    const llama_model_quantize_params * params;
+
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    int i_attention_wv    = 0;
+    int i_feed_forward_w2 = 0;
+
+    int n_k_quantized = 0;
+    int n_fallback    = 0;
+
+    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+    {}
+};
+
 static void llama_convert_tensor_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -8043,14 +8159,14 @@ static void llama_convert_tensor_internal(
     workers.clear();
 }
 
-#ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+    quantize_state_internal & qs,
+    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const auto tn = LLM_TN(model.arch);
+    const llm_arch arch = qs.model.arch;
+    const auto tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
@@ -8058,7 +8174,7 @@ static ggml_type get_k_quant_type(
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
-        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
@@ -8067,46 +8183,46 @@ static ggml_type get_k_quant_type(
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (model.type == MODEL_70B) {
+                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_attention_wv;
+        ++qs.i_attention_wv;
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
             new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_feed_forward_w2;
+        ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
+        if (arch != LLM_ARCH_FALCON) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8133,25 +8249,27 @@ static ggml_type get_k_quant_type(
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
         }
     }
     if (convert_incompatible_tensor) {
-        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-        } else {
-            throw std::runtime_error("Unsupported tensor size encountered\n");
+        switch (new_type) {
+            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
+            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
     }
 
     return new_type;
 }
-#endif
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
@@ -8166,7 +8284,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
         case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
 
-#ifdef GGML_USE_K_QUANTS
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -8177,7 +8294,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
-#endif
+
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
 
@@ -8204,6 +8321,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
 
+    struct quantize_state_internal qs(model, params);
+
     if (params->only_copy) {
         ftype = model.ftype;
     }
@@ -8216,10 +8335,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
-#ifdef GGML_USE_K_QUANTS
-    int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
@@ -8227,21 +8342,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++n_attention_wv;
+            ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
+            ++qs.n_feed_forward_w2;
         }
     }
-    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
     }
 
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
-#endif
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
@@ -8305,11 +8416,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         if (quantize) {
             new_type = quantized_type;
-#ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(
-                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-            );
-#endif
+            if (!params->pure) {
+                new_type = get_k_quant_type(qs, new_type, tensor, ftype);
+            }
+
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             quantize = tensor->type != new_type;
@@ -8434,6 +8544,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("\n");
         }
     }
+
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
 }
 
 static int llama_apply_lora_from_file_internal(
@@ -8758,6 +8873,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.allow_requantize       =*/ false,
         /*.quantize_output_tensor =*/ true,
         /*.only_copy              =*/ false,
+        /*.pure                   =*/ false,
     };
 
     return result;
@@ -9118,8 +9234,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->kv_self.head;
 }
 
-void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
-    llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
+void llama_kv_cache_clear(struct llama_context * ctx) {
+    llama_kv_cache_clear(ctx->kv_self);
 }
 
 void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
@@ -9565,7 +9681,7 @@ int llama_eval(
         llama_token * tokens,
         int32_t n_tokens,
         int n_past) {
-    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
+    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
 
     const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
     if (ret < 0) {
@@ -9580,7 +9696,7 @@ int llama_eval_embd(
         float * embd,
         int32_t n_tokens,
         int n_past) {
-    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
+    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
 
     llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
 

26  llama.h
@@ -178,7 +178,7 @@ extern "C" {
         float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
@@ -191,6 +191,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
     } llama_model_quantize_params;
 
     // grammar types
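A rough usage sketch for the new flag from the public API. The model paths are placeholders, and llama_model_quantize / llama_model_quantize_default_params are assumed to be the existing entry points declared elsewhere in this header:

    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.pure  = true; // skip the k-quant mixture heuristics: quantized tensors all use the base Q4_K type

        // hypothetical paths, for illustration only
        return llama_model_quantize("models/7B/ggml-model-f16.gguf",
                                    "models/7B/ggml-model-q4_k_m.gguf",
                                    &params);
    }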
@@ -333,17 +334,14 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Remove all tokens data of cells in [c0, c1)
-    // c0 < 0 : [0, c1]
-    // c1 < 0 : [c0, inf)
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-            int32_t c0,
-            int32_t c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // p0 < 0 : [0, p1]
-    // p1 < 0 : [p0, inf)
+    // seq_id < 0 : match any sequence
+    // p0 < 0     : [0, p1]
+    // p1 < 0     : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
             struct llama_context * ctx,
             llama_seq_id seq_id,
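Callers of the removed llama_kv_cache_tokens_rm need a small migration. A hedged sketch of the two common patterns, based only on the signatures shown above (the helper names are illustrative):

    #include "llama.h"

    // before: llama_kv_cache_tokens_rm(ctx, -1, -1);  // drop everything
    static void reset_cache(llama_context * ctx) {
        llama_kv_cache_clear(ctx);
    }

    // before: llama_kv_cache_tokens_rm(ctx, n_past, -1);  // drop everything past n_past
    // a seq_id of -1 matches any sequence, so this removes positions [n_past, inf) across the cache
    static void rewind_to(llama_context * ctx, llama_pos n_past) {
        llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
    }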
@@ -600,6 +598,13 @@ extern "C" {
             float p,
             size_t min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
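A rough end-to-end usage sketch for the new declaration. The helper name is hypothetical and llama_sample_token is assumed from the existing sampling API; only llama_sample_min_p itself comes from this diff:

    #include "llama.h"

    #include <vector>

    static llama_token sample_with_min_p(llama_context * ctx, const float * logits, int n_vocab) {
        std::vector<llama_token_data> cur;
        cur.reserve(n_vocab);
        for (llama_token id = 0; id < n_vocab; ++id) {
            cur.push_back({ id, logits[id], 0.0f });
        }
        llama_token_data_array candidates = { cur.data(), cur.size(), false };

        // drop every token whose probability falls below 5% of the top token's probability,
        // but always keep at least one candidate
        llama_sample_min_p(ctx, &candidates, 0.05f, 1);

        return llama_sample_token(ctx, &candidates);
    }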
@@ -658,6 +663,7 @@ extern "C" {
             float * mu);
 
     /// @details Selects the token with the highest probability.
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
             struct llama_context * ctx,
             llama_token_data_array * candidates);
 
@@ -4,7 +4,7 @@
 
 #undef NDEBUG
 #include <cassert>
-#if !defined(__riscv) && !defined(__s390__)
+#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
 #include <immintrin.h>
 #endif
 #include <cmath>
@@ -129,6 +129,13 @@ int main(int argc, char * argv[]) {
         ggml_type type = (ggml_type) i;
         ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
 
+        // deprecated - skip
+        if (qfns.blck_size == 0) {
+            continue;
+        }
+
+        printf("Testing %s\n", ggml_type_name((ggml_type) i));
+
         if (qfns.from_float && qfns.to_float) {
             const float total_error = total_quantization_error(qfns, test_size, test_data.data());
             const float max_quantization_error =