Merge 'origin/master' into hipblas
This commit is contained in:
commit
85f902d5c4
15 changed files with 499 additions and 257 deletions
|
@ -73,6 +73,7 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA
|
|||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" OFF)
|
||||
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
|
@ -227,6 +228,11 @@ if (LLAMA_METAL)
|
|||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_K_QUANTS)
|
||||
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
|
||||
add_compile_definitions(GGML_USE_K_QUANTS)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast)
|
||||
if (CLBlast_FOUND)
|
||||
|
@ -428,11 +434,10 @@ endif()
|
|||
add_library(ggml OBJECT
|
||||
ggml.c
|
||||
ggml.h
|
||||
ggml-quants-k.h
|
||||
ggml-quants-k.c
|
||||
${GGML_SOURCES_CUDA}
|
||||
${GGML_SOURCES_OPENCL}
|
||||
${GGML_SOURCES_METAL}
|
||||
${GGML_SOURCES_EXTRA}
|
||||
)
|
||||
|
||||
target_include_directories(ggml PUBLIC .)
|
||||
|
|
37
Makefile
37
Makefile
|
@ -121,6 +121,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
|
|||
endif
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_K_QUANTS
|
||||
CFLAGS += -DGGML_USE_K_QUANTS
|
||||
OBJS += k_quants.o
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_ACCELERATE
|
||||
# Mac M1 - include Accelerate framework.
|
||||
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
|
||||
|
@ -140,7 +145,7 @@ ifdef LLAMA_OPENBLAS
|
|||
endif # LLAMA_OPENBLAS
|
||||
|
||||
ifdef LLAMA_BLIS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
LDFLAGS += -lblis -L/usr/local/lib
|
||||
endif # LLAMA_BLIS
|
||||
|
||||
|
@ -230,6 +235,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
|
|||
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
|
||||
ifdef LLAMA_NO_K_QUANTS
|
||||
k_quants.o: k_quants.c k_quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_NO_K_QUANTS
|
||||
|
||||
#
|
||||
# Print build information
|
||||
#
|
||||
|
@ -249,10 +259,7 @@ $(info )
|
|||
# Build library
|
||||
#
|
||||
|
||||
ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h
|
||||
ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
|
||||
|
@ -261,7 +268,7 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
|
|||
common.o: examples/common.cpp examples/common.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
libllama.so: llama.o ggml.o ggml-quants-k.o $(OBJS)
|
||||
libllama.so: llama.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
clean:
|
||||
|
@ -271,28 +278,28 @@ clean:
|
|||
# Examples
|
||||
#
|
||||
|
||||
main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
||||
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
quantize: examples/quantize/quantize.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
|
||||
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
||||
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
||||
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
|
||||
|
||||
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
|
@ -307,11 +314,11 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
|||
# Tests
|
||||
#
|
||||
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o ggml-quants-k.o $(OBJS)
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
./$@
|
||||
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS)
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
.PHONY: tests clean
|
||||
|
|
|
@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
|||
|
||||
**Hot topics:**
|
||||
|
||||
- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
|
||||
- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
|
||||
- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
|
||||
- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
|
||||
|
|
|
@ -132,6 +132,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
params.path_prompt_cache = argv[i];
|
||||
} else if (arg == "--prompt-cache-all") {
|
||||
params.prompt_cache_all = true;
|
||||
} else if (arg == "--prompt-cache-ro") {
|
||||
params.prompt_cache_ro = true;
|
||||
} else if (arg == "-f" || arg == "--file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -432,6 +434,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
|
||||
fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
|
||||
fprintf(stderr, " not supported with --interactive or other interactive options\n");
|
||||
fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
|
||||
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
|
||||
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
|
||||
fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
|
||||
|
|
|
@ -62,6 +62,7 @@ struct gpt_params {
|
|||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
|
|
|
@ -417,7 +417,7 @@ int main(int argc, char ** argv) {
|
|||
const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
// optionally save the session on first sample (for faster prompt loading next time)
|
||||
if (!path_session.empty() && need_to_save_session) {
|
||||
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
|
||||
need_to_save_session = false;
|
||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
@ -630,7 +630,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
if (!path_session.empty() && params.prompt_cache_all) {
|
||||
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
||||
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
|
30
flake.lock
generated
30
flake.lock
generated
|
@ -1,12 +1,15 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1676283394,
|
||||
"narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
|
||||
"lastModified": 1685518550,
|
||||
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
|
||||
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
@ -17,11 +20,11 @@
|
|||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1678470307,
|
||||
"narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
|
||||
"lastModified": 1685931219,
|
||||
"narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
|
||||
"rev": "7409480d5c8584a1a83c422530419efe4afb0d19",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
@ -36,6 +39,21 @@
|
|||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
|
|
26
flake.nix
26
flake.nix
|
@ -6,6 +6,13 @@
|
|||
outputs = { self, nixpkgs, flake-utils }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
inherit (pkgs.stdenv) isAarch64 isDarwin;
|
||||
inherit (pkgs.lib) optionals;
|
||||
isM1 = isAarch64 && isDarwin;
|
||||
osSpecific =
|
||||
if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ]
|
||||
else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ]
|
||||
else [ ];
|
||||
pkgs = import nixpkgs {
|
||||
inherit system;
|
||||
};
|
||||
|
@ -18,17 +25,22 @@
|
|||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
postPatch =
|
||||
if isM1 then ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[[NSBundle mainBundle] pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
|
||||
'' else "";
|
||||
nativeBuildInputs = with pkgs; [ cmake ];
|
||||
buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
|
||||
darwin.apple_sdk.frameworks.Accelerate
|
||||
];
|
||||
cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
|
||||
buildInputs = osSpecific;
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
];
|
||||
"-DLLAMA_METAL=ON"
|
||||
]);
|
||||
installPhase = ''
|
||||
mkdir -p $out/bin
|
||||
mv bin/* $out/bin/
|
||||
mv $out/bin/main $out/bin/llama
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
|
||||
echo "#!${llama-python}/bin/python" > $out/bin/convert.py
|
||||
cat ${./convert.py} >> $out/bin/convert.py
|
||||
|
@ -40,9 +52,7 @@
|
|||
packages = with pkgs; [
|
||||
cmake
|
||||
llama-python
|
||||
] ++ lib.optionals stdenv.isDarwin [
|
||||
darwin.apple_sdk.frameworks.Accelerate
|
||||
];
|
||||
] ++ osSpecific;
|
||||
};
|
||||
}
|
||||
);
|
||||
|
|
120
ggml-cuda.cu
120
ggml-cuda.cu
|
@ -164,24 +164,24 @@ typedef struct {
|
|||
uint8_t qs[QK_K/4]; // quants
|
||||
half d; // super-block scale for quantized scales
|
||||
half dmin; // super-block scale for quantized mins
|
||||
} block_q2_k;
|
||||
static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding");
|
||||
} block_q2_K;
|
||||
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
||||
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8];
|
||||
uint8_t qs[QK_K/4]; // nibbles / quants
|
||||
uint8_t scales[3*QK_K/64];
|
||||
half d;
|
||||
} block_q3_k;
|
||||
static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding");
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
|
||||
|
||||
typedef struct {
|
||||
half d; // super-block scale for quantized scales
|
||||
half dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_k;
|
||||
static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding");
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
||||
|
||||
typedef struct {
|
||||
half d; // super-block scale for quantized scales
|
||||
|
@ -189,16 +189,16 @@ typedef struct {
|
|||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_k;
|
||||
static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding");
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||
|
||||
typedef struct {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
int8_t scales[QK_K/16]; // scales
|
||||
half d; // delta
|
||||
} block_q6_k;
|
||||
static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding");
|
||||
} block_q6_K;
|
||||
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
|
||||
|
||||
#define WARP_SIZE 32
|
||||
|
||||
|
@ -353,7 +353,7 @@ static __device__ void dequantize_q8_0(const void * vx, const int ib, const int
|
|||
|
||||
//================================== k-quants
|
||||
|
||||
static __global__ void dequantize_block_q2_k(const void * vx, float * yy) {
|
||||
static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
|
@ -361,7 +361,7 @@ static __global__ void dequantize_block_q2_k(const void * vx, float * yy) {
|
|||
const int l = tid - 32*n;
|
||||
const int is = 8*n + l/16;
|
||||
|
||||
const block_q2_k * x = (const block_q2_k *) vx;
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
const uint8_t q = x[i].qs[32*n + l];
|
||||
float * y = yy + i*QK_K + 128*n;
|
||||
|
@ -375,9 +375,9 @@ static __global__ void dequantize_block_q2_k(const void * vx, float * yy) {
|
|||
|
||||
}
|
||||
|
||||
static __device__ void vec_dot_q2_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||
static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||
|
||||
const block_q2_k * x = (const block_q2_k *) vx;
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
// if n is 0, we want to do the lower 128, else the upper 128,
|
||||
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
|
||||
|
@ -406,7 +406,7 @@ static __device__ void vec_dot_q2_k(const void * vx, const int ib, const int iqs
|
|||
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q3_k(const void * vx, float * yy) {
|
||||
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
||||
|
||||
int r = threadIdx.x/4;
|
||||
int i = blockIdx.x;
|
||||
|
@ -416,7 +416,7 @@ static __global__ void dequantize_block_q3_k(const void * vx, float * yy) {
|
|||
int n = tid / 4;
|
||||
int j = tid - 4*n;
|
||||
|
||||
const block_q3_k * x = (const block_q3_k *) vx;
|
||||
const block_q3_K * x = (const block_q3_K *) vx;
|
||||
|
||||
uint8_t m = 1 << (4*n + j);
|
||||
int is = 8*n + 2*j + is0;
|
||||
|
@ -437,9 +437,9 @@ static __global__ void dequantize_block_q3_k(const void * vx, float * yy) {
|
|||
|
||||
}
|
||||
|
||||
static __device__ void vec_dot_q3_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||
static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||
|
||||
const block_q3_k * x = (const block_q3_k *) vx;
|
||||
const block_q3_K * x = (const block_q3_K *) vx;
|
||||
|
||||
const uint32_t kmask1 = 0x03030303;
|
||||
const uint32_t kmask2 = 0x0f0f0f0f;
|
||||
|
@ -491,8 +491,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
|
|||
}
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q4_k(const void * vx, float * yy) {
|
||||
const block_q4_k * x = (const block_q4_k *) vx;
|
||||
static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
|
||||
const block_q4_K * x = (const block_q4_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
|
@ -528,9 +528,9 @@ static __global__ void dequantize_block_q4_k(const void * vx, float * yy) {
|
|||
}
|
||||
}
|
||||
|
||||
static __device__ void vec_dot_q4_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||
static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||
|
||||
const block_q4_k * x = (const block_q4_k *) vx;
|
||||
const block_q4_K * x = (const block_q4_K *) vx;
|
||||
|
||||
// iqs is in 0...248 in steps of 8 =>
|
||||
const int j = iqs / 64; // j is in 0...3
|
||||
|
@ -560,8 +560,8 @@ static __device__ void vec_dot_q4_k(const void * vx, const int ib, const int iqs
|
|||
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q5_k(const void * vx, float * yy) {
|
||||
const block_q5_k * x = (const block_q5_k *) vx;
|
||||
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
||||
const block_q5_K * x = (const block_q5_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
|
@ -593,9 +593,9 @@ static __global__ void dequantize_block_q5_k(const void * vx, float * yy) {
|
|||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||
}
|
||||
|
||||
static __device__ void vec_dot_q5_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||
static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||
|
||||
const block_q5_k * x = (const block_q5_k *) vx;
|
||||
const block_q5_K * x = (const block_q5_K *) vx;
|
||||
|
||||
// iqs is in 0...248 in steps of 8 =>
|
||||
const int j = iqs / 64; // j is in 0...3
|
||||
|
@ -630,8 +630,8 @@ static __device__ void vec_dot_q5_k(const void * vx, const int ib, const int iqs
|
|||
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q6_k(const void * vx, float * yy) {
|
||||
const block_q6_k * x = (const block_q6_k *) vx;
|
||||
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
||||
const block_q6_K * x = (const block_q6_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
|
@ -655,9 +655,9 @@ static __global__ void dequantize_block_q6_k(const void * vx, float * yy) {
|
|||
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
||||
}
|
||||
|
||||
static __device__ void vec_dot_q6_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||
static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||
|
||||
const block_q6_k * x = (const block_q6_k *) vx;
|
||||
const block_q6_K * x = (const block_q6_K *) vx;
|
||||
|
||||
const int ip = iqs / 128; // 0 or 1
|
||||
const int il = (iqs - 128*ip)/8; // 0...15
|
||||
|
@ -858,29 +858,29 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
|
|||
dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
||||
static void dequantize_row_q2_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q2_k<<<nb, 64, 0, stream>>>(vx, y);
|
||||
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
static void dequantize_row_q3_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q3_k<<<nb, 64, 0, stream>>>(vx, y);
|
||||
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
static void dequantize_row_q4_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q4_k<<<nb, 32, 0, stream>>>(vx, y);
|
||||
dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
static void dequantize_row_q5_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q5_k<<<nb, 64, 0, stream>>>(vx, y);
|
||||
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
static void dequantize_row_q6_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_q6_k<<<nb, 64, 0, stream>>>(vx, y);
|
||||
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
|
@ -923,35 +923,35 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
|
|||
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q2_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2;
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_k<32, vec_dot_q2_k><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q3_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const dim3 block_dims(32, 2, 1);
|
||||
dequantize_mul_mat_vec_k<32, vec_dot_q3_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const dim3 block_dims(32, 2, 1);
|
||||
dequantize_mul_mat_vec_k<32, vec_dot_q4_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const dim3 block_dims(32, 2, 1);
|
||||
dequantize_mul_mat_vec_k<32, vec_dot_q5_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q6_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const dim3 block_dims(32, 2, 1);
|
||||
dequantize_mul_mat_vec_k<32, vec_dot_q6_k><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
|
@ -980,15 +980,15 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_row_q8_0_cuda;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return dequantize_row_q2_k_cuda;
|
||||
return dequantize_row_q2_K_cuda;
|
||||
case GGML_TYPE_Q3_K:
|
||||
return dequantize_row_q3_k_cuda;
|
||||
return dequantize_row_q3_K_cuda;
|
||||
case GGML_TYPE_Q4_K:
|
||||
return dequantize_row_q4_k_cuda;
|
||||
return dequantize_row_q4_K_cuda;
|
||||
case GGML_TYPE_Q5_K:
|
||||
return dequantize_row_q5_k_cuda;
|
||||
return dequantize_row_q5_K_cuda;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return dequantize_row_q6_k_cuda;
|
||||
return dequantize_row_q6_K_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_fp16_to_fp32_cuda;
|
||||
default:
|
||||
|
@ -1331,19 +1331,19 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|||
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
dequantize_mul_mat_vec_q2_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
dequantize_mul_mat_vec_q3_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
dequantize_mul_mat_vec_q4_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
dequantize_mul_mat_vec_q5_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
dequantize_mul_mat_vec_q6_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||
|
|
23
ggml-metal.m
23
ggml-metal.m
|
@ -49,9 +49,11 @@ struct ggml_metal_context {
|
|||
GGML_METAL_DECL_KERNEL(diag_mask_inf);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_k);
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
|
||||
GGML_METAL_DECL_KERNEL(rope);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
|
||||
|
@ -133,9 +135,11 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|||
GGML_METAL_ADD_KERNEL(diag_mask_inf);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_k);
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
|
||||
GGML_METAL_ADD_KERNEL(rope);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
|
||||
|
@ -517,7 +521,20 @@ void ggml_metal_graph_compute(
|
|||
nth1 = 4;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
|
||||
} break;
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
case GGML_TYPE_Q4_K:
|
||||
{
|
||||
GGML_ASSERT(ne02 == 1);
|
||||
GGML_ASSERT(ne12 == 1);
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "Asserting on type %d\n",(int)src0t);
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
@ -540,6 +557,9 @@ void ggml_metal_graph_compute(
|
|||
if (src0t == GGML_TYPE_Q4_0) {
|
||||
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else if (src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else {
|
||||
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
|
@ -555,6 +575,7 @@ void ggml_metal_graph_compute(
|
|||
switch (src0->type) {
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
|
||||
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
|
||||
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
|
||||
|
|
162
ggml-metal.metal
162
ggml-metal.metal
|
@ -503,3 +503,165 @@ kernel void kernel_cpy_f32_f32(
|
|||
dst_data[i00] = src[0];
|
||||
}
|
||||
}
|
||||
|
||||
//============================================ k-quants ======================================================
|
||||
|
||||
#define QK_K 256
|
||||
|
||||
typedef struct {
|
||||
half d; // super-block scale for quantized scales
|
||||
half dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_k;
|
||||
|
||||
static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
|
||||
uchar4 r;
|
||||
if (j < 4) {
|
||||
r[0] = q[j+0] & 63; r[1] = q[j+4] & 63;
|
||||
r[2] = q[j+1] & 63; r[3] = q[j+5] & 63;
|
||||
} else {
|
||||
r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
|
||||
r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||
r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4);
|
||||
r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = x[i].d;
|
||||
const float min = x[i].dmin;
|
||||
|
||||
device const uint8_t * q = x[i].qs;
|
||||
device const uint8_t * scales = x[i].scales;
|
||||
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K; j += 64) {
|
||||
const uchar4 sc = get_scale_min_k4(is, scales);
|
||||
const float d1 = d * sc[0]; const float m1 = min * sc[1];
|
||||
const float d2 = d * sc[2]; const float m2 = min * sc[3];
|
||||
for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1;
|
||||
for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
|
||||
q += 32; is += 2;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_q4_k(
|
||||
device const void * src0,
|
||||
device const int * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb1,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
const int i = tpig;
|
||||
const int r = ((device int32_t *) src1)[i];
|
||||
|
||||
dequantize_row_q4_k(
|
||||
(device const block_q4_k *) ((device char *) src0 + r*nb01),
|
||||
(device float *) ((device char *) dst + i*nb1), ne00);
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q4_k_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
threadgroup float * sum [[threadgroup(0)]],
|
||||
uint2 tgpig[[threadgroup_position_in_grid]],
|
||||
uint2 tpig[[thread_position_in_grid]], // we don't use this for now
|
||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||
uint2 tptg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb;
|
||||
device const float * yy = (device const float *) src1 + r1*ne10;
|
||||
|
||||
const uint nth = tptg.x*tptg.y;
|
||||
const uint ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
const int tid = tpitg.y; // 0...16
|
||||
const int il = tid/4; // 0...3
|
||||
const int ir = tid%4; // 0...3
|
||||
const int n = 8;
|
||||
const int is = 2*il;
|
||||
|
||||
sum[ith] = 0.0f;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tpitg.x; i < nb; i += tptg.x) {
|
||||
|
||||
device const uint8_t * q = (x + i)->qs + 32*il + n*ir;
|
||||
device const float * y = yy + i*QK_K + 64*il + n*ir;
|
||||
device const uint8_t * scales = (x + i)->scales;
|
||||
|
||||
const float dall = (float)((x + i)->d);
|
||||
const float dmin = (float)((x + i)->dmin);
|
||||
|
||||
const uchar4 sc = get_scale_min_k4(is, scales);
|
||||
|
||||
float4 s = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int l = 0; l < n; ++l) {
|
||||
s[0] += y[l+ 0] * (q[l] & 0xF); s[1] += y[l+ 0];
|
||||
s[2] += y[l+32] * (q[l] >> 4); s[3] += y[l+32];
|
||||
}
|
||||
sumf += dall * (s[0] * sc[0] + s[2] * sc[2]) - dmin * (s[1] * sc[1] + s[3] * sc[3]);
|
||||
|
||||
}
|
||||
sum[ith] = sumf;
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
// This version is slightly faster than the commented out one below,
|
||||
// which I copy-pasted from ggerganov's q4_0 dot product for metal.
|
||||
//
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%4 == 0) {
|
||||
for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%16 == 0) {
|
||||
for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith == 0) {
|
||||
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
dst[r1*ne0 + r0] = sum[0];
|
||||
}
|
||||
|
||||
//// accumulate the sum from all threads in the threadgroup
|
||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//for (uint i = nth/2; i > 0; i /= 2) {
|
||||
// if (ith < i) {
|
||||
// sum[ith] += sum[ith + i];
|
||||
// }
|
||||
// threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//}
|
||||
|
||||
//if (ith == 0) {
|
||||
// dst[r1*ne0 + r0] = sum[0];
|
||||
//}
|
||||
}
|
||||
|
|
121
ggml.c
121
ggml.c
|
@ -2,7 +2,10 @@
|
|||
#define _GNU_SOURCE
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-quants-k.h"
|
||||
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
#include "k_quants.h"
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
|
@ -1580,46 +1583,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
|||
.vec_dot_q = NULL, // TODO
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
},
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
[GGML_TYPE_Q2_K] = {
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_k,
|
||||
.quantize_row_q = quantize_row_q2_k,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_k_reference,
|
||||
.quantize_row_q_dot = quantize_row_q8_k,
|
||||
.vec_dot_q = ggml_vec_dot_q2_k_q8_k,
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K,
|
||||
.quantize_row_q = quantize_row_q2_K,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
|
||||
.quantize_row_q_dot = quantize_row_q8_K,
|
||||
.vec_dot_q = ggml_vec_dot_q2_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
},
|
||||
[GGML_TYPE_Q3_K] = {
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_k,
|
||||
.quantize_row_q = quantize_row_q3_k,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_k_reference,
|
||||
.quantize_row_q_dot = quantize_row_q8_k,
|
||||
.vec_dot_q = ggml_vec_dot_q3_k_q8_k,
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
|
||||
.quantize_row_q = quantize_row_q3_K,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
|
||||
.quantize_row_q_dot = quantize_row_q8_K,
|
||||
.vec_dot_q = ggml_vec_dot_q3_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
},
|
||||
[GGML_TYPE_Q4_K] = {
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_k,
|
||||
.quantize_row_q = quantize_row_q4_k,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_k_reference,
|
||||
.quantize_row_q_dot = quantize_row_q8_k,
|
||||
.vec_dot_q = ggml_vec_dot_q4_k_q8_k,
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K,
|
||||
.quantize_row_q = quantize_row_q4_K,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
|
||||
.quantize_row_q_dot = quantize_row_q8_K,
|
||||
.vec_dot_q = ggml_vec_dot_q4_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
},
|
||||
[GGML_TYPE_Q5_K] = {
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_k,
|
||||
.quantize_row_q = quantize_row_q5_k,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_k_reference,
|
||||
.quantize_row_q_dot = quantize_row_q8_k,
|
||||
.vec_dot_q = ggml_vec_dot_q5_k_q8_k,
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K,
|
||||
.quantize_row_q = quantize_row_q5_K,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
|
||||
.quantize_row_q_dot = quantize_row_q8_K,
|
||||
.vec_dot_q = ggml_vec_dot_q5_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
},
|
||||
[GGML_TYPE_Q6_K] = {
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_k,
|
||||
.quantize_row_q = quantize_row_q6_k,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_k_reference,
|
||||
.quantize_row_q_dot = quantize_row_q8_k,
|
||||
.vec_dot_q = ggml_vec_dot_q6_k_q8_k,
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
|
||||
.quantize_row_q = quantize_row_q6_K,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
|
||||
.quantize_row_q_dot = quantize_row_q8_K,
|
||||
.vec_dot_q = ggml_vec_dot_q6_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
},
|
||||
#endif
|
||||
};
|
||||
|
||||
// For internal test use
|
||||
|
@ -3499,12 +3504,14 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
|
|||
[GGML_TYPE_Q5_1] = QK5_1,
|
||||
[GGML_TYPE_Q8_0] = QK8_0,
|
||||
[GGML_TYPE_Q8_1] = QK8_1,
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
[GGML_TYPE_Q2_K] = QK_K,
|
||||
[GGML_TYPE_Q3_K] = QK_K,
|
||||
[GGML_TYPE_Q4_K] = QK_K,
|
||||
[GGML_TYPE_Q5_K] = QK_K,
|
||||
[GGML_TYPE_Q6_K] = QK_K,
|
||||
[GGML_TYPE_Q8_K] = QK_K,
|
||||
#endif
|
||||
[GGML_TYPE_I8] = 1,
|
||||
[GGML_TYPE_I16] = 1,
|
||||
[GGML_TYPE_I32] = 1,
|
||||
|
@ -3520,12 +3527,14 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
|||
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
|
||||
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
|
||||
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
|
||||
[GGML_TYPE_Q2_K] = sizeof(block_q2_k),
|
||||
[GGML_TYPE_Q3_K] = sizeof(block_q3_k),
|
||||
[GGML_TYPE_Q4_K] = sizeof(block_q4_k),
|
||||
[GGML_TYPE_Q5_K] = sizeof(block_q5_k),
|
||||
[GGML_TYPE_Q6_K] = sizeof(block_q6_k),
|
||||
[GGML_TYPE_Q8_K] = sizeof(block_q8_k),
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
[GGML_TYPE_Q2_K] = sizeof(block_q2_K),
|
||||
[GGML_TYPE_Q3_K] = sizeof(block_q3_K),
|
||||
[GGML_TYPE_Q4_K] = sizeof(block_q4_K),
|
||||
[GGML_TYPE_Q5_K] = sizeof(block_q5_K),
|
||||
[GGML_TYPE_Q6_K] = sizeof(block_q6_K),
|
||||
[GGML_TYPE_Q8_K] = sizeof(block_q8_K),
|
||||
#endif
|
||||
[GGML_TYPE_I8] = sizeof(int8_t),
|
||||
[GGML_TYPE_I16] = sizeof(int16_t),
|
||||
[GGML_TYPE_I32] = sizeof(int32_t),
|
||||
|
@ -3542,12 +3551,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
|||
[GGML_TYPE_Q5_1] = "q5_1",
|
||||
[GGML_TYPE_Q8_0] = "q8_0",
|
||||
[GGML_TYPE_Q8_1] = "q8_1",
|
||||
[GGML_TYPE_Q2_K] = "q2_k",
|
||||
[GGML_TYPE_Q3_K] = "q3_k",
|
||||
[GGML_TYPE_Q4_K] = "q4_k",
|
||||
[GGML_TYPE_Q5_K] = "q5_k",
|
||||
[GGML_TYPE_Q6_K] = "q6_k",
|
||||
[GGML_TYPE_Q8_K] = "q8_k",
|
||||
[GGML_TYPE_Q2_K] = "q2_K",
|
||||
[GGML_TYPE_Q3_K] = "q3_K",
|
||||
[GGML_TYPE_Q4_K] = "q4_K",
|
||||
[GGML_TYPE_Q5_K] = "q5_K",
|
||||
[GGML_TYPE_Q6_K] = "q6_K",
|
||||
[GGML_TYPE_Q8_K] = "q8_K",
|
||||
[GGML_TYPE_I8] = "i8",
|
||||
[GGML_TYPE_I16] = "i16",
|
||||
[GGML_TYPE_I32] = "i32",
|
||||
|
@ -14720,7 +14729,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
|
|||
const int64_t * ne = tensor->ne;
|
||||
const size_t * nb = tensor->nb;
|
||||
|
||||
fprintf(fout, "%-6s %-12s %8d %8jd %jd %jd %jd %16zu %16zu %16zu %16zu %16p %32s\n",
|
||||
fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
|
||||
ggml_type_name(tensor->type),
|
||||
ggml_op_name (tensor->op),
|
||||
tensor->n_dims,
|
||||
|
@ -14734,7 +14743,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
|
|||
const int64_t * ne = tensor->ne;
|
||||
const size_t * nb = tensor->nb;
|
||||
|
||||
fprintf(fout, "%-6s %-6s %-12s %8d %jd %jd %jd %jd %16zu %16zu %16zu %16zu %8d %16p %32s\n",
|
||||
fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
|
||||
arg,
|
||||
ggml_type_name(tensor->type),
|
||||
ggml_op_name (tensor->op),
|
||||
|
@ -14763,11 +14772,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|||
FILE * fout = stdout;
|
||||
|
||||
fprintf(fout, "\n");
|
||||
fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
|
||||
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
|
||||
fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
|
||||
fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
|
||||
fprintf(fout, "%-16s %8ju\n", "eval", size_eval);
|
||||
fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
|
||||
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
|
||||
fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
|
||||
fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
|
||||
fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);
|
||||
|
||||
// header
|
||||
fprintf(fout, "\n");
|
||||
|
@ -16249,36 +16258,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|||
block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
|
||||
result = ggml_quantize_q8_0(src + start, block, n, n, hist);
|
||||
} break;
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
block_q2_k * block = (block_q2_k*)dst + start / QK_K;
|
||||
result = ggml_quantize_q2_k(src + start, block, n, n, hist);
|
||||
block_q2_K * block = (block_q2_K*)dst + start / QK_K;
|
||||
result = ggml_quantize_q2_K(src + start, block, n, n, hist);
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
block_q3_k * block = (block_q3_k*)dst + start / QK_K;
|
||||
result = ggml_quantize_q3_k(src + start, block, n, n, hist);
|
||||
block_q3_K * block = (block_q3_K*)dst + start / QK_K;
|
||||
result = ggml_quantize_q3_K(src + start, block, n, n, hist);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
block_q4_k * block = (block_q4_k*)dst + start / QK_K;
|
||||
result = ggml_quantize_q4_k(src + start, block, n, n, hist);
|
||||
block_q4_K * block = (block_q4_K*)dst + start / QK_K;
|
||||
result = ggml_quantize_q4_K(src + start, block, n, n, hist);
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
block_q5_k * block = (block_q5_k*)dst + start / QK_K;
|
||||
result = ggml_quantize_q5_k(src + start, block, n, n, hist);
|
||||
block_q5_K * block = (block_q5_K*)dst + start / QK_K;
|
||||
result = ggml_quantize_q5_K(src + start, block, n, n, hist);
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
block_q6_k * block = (block_q6_k*)dst + start / QK_K;
|
||||
result = ggml_quantize_q6_k(src + start, block, n, n, hist);
|
||||
block_q6_K * block = (block_q6_K*)dst + start / QK_K;
|
||||
result = ggml_quantize_q6_K(src + start, block, n, n, hist);
|
||||
} break;
|
||||
#endif
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#include "ggml-quants-k.h"
|
||||
#include "k_quants.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <math.h>
|
||||
|
@ -272,7 +272,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
|||
|
||||
//========================- 2-bit (de)-quantization
|
||||
|
||||
void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k) {
|
||||
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -341,7 +341,7 @@ void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict
|
|||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k) {
|
||||
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -374,26 +374,26 @@ void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int
|
|||
}
|
||||
}
|
||||
|
||||
void quantize_row_q2_k(const float * restrict x, void * restrict vy, int k) {
|
||||
quantize_row_q2_k_reference(x, vy, k);
|
||||
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
||||
quantize_row_q2_K_reference(x, vy, k);
|
||||
}
|
||||
|
||||
size_t ggml_quantize_q2_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||
const int nb = k / QK_K;
|
||||
|
||||
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
||||
(void)hist;
|
||||
|
||||
for (int j = 0; j < nb; j += k) {
|
||||
block_q2_k * restrict y = (block_q2_k *)dst + j/QK_K;
|
||||
quantize_row_q2_k_reference(src + j, y, k);
|
||||
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
||||
quantize_row_q2_K_reference(src + j, y, k);
|
||||
}
|
||||
return (n/QK_K*sizeof(block_q2_k));
|
||||
return (n/QK_K*sizeof(block_q2_K));
|
||||
}
|
||||
|
||||
//========================= 3-bit (de)-quantization
|
||||
|
||||
void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k) {
|
||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -469,7 +469,7 @@ void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict
|
|||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k) {
|
||||
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
assert(QK_K == 256);
|
||||
const int nb = k / QK_K;
|
||||
|
@ -520,26 +520,26 @@ void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int
|
|||
}
|
||||
}
|
||||
|
||||
void quantize_row_q3_k(const float * restrict x, void * restrict vy, int k) {
|
||||
quantize_row_q3_k_reference(x, vy, k);
|
||||
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
||||
quantize_row_q3_K_reference(x, vy, k);
|
||||
}
|
||||
|
||||
size_t ggml_quantize_q3_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||
const int nb = k / QK_K;
|
||||
|
||||
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
||||
(void)hist;
|
||||
|
||||
for (int j = 0; j < nb; j += k) {
|
||||
block_q3_k * restrict y = (block_q3_k *)dst + j/QK_K;
|
||||
quantize_row_q3_k_reference(src + j, y, k);
|
||||
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
||||
quantize_row_q3_K_reference(src + j, y, k);
|
||||
}
|
||||
return (n/QK_K*sizeof(block_q3_k));
|
||||
return (n/QK_K*sizeof(block_q3_K));
|
||||
}
|
||||
|
||||
// ====================== 4-bit (de)-quantization
|
||||
|
||||
void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k) {
|
||||
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -604,7 +604,7 @@ void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict
|
|||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k) {
|
||||
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -630,26 +630,26 @@ void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int
|
|||
}
|
||||
}
|
||||
|
||||
void quantize_row_q4_k(const float * restrict x, void * restrict vy, int k) {
|
||||
void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_q4_k * restrict y = vy;
|
||||
quantize_row_q4_k_reference(x, y, k);
|
||||
block_q4_K * restrict y = vy;
|
||||
quantize_row_q4_K_reference(x, y, k);
|
||||
}
|
||||
|
||||
size_t ggml_quantize_q4_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
(void)hist; // TODO: collect histograms
|
||||
for (int j = 0; j < nb; j += k) {
|
||||
block_q4_k * restrict y = (block_q4_k *)dst + j/QK_K;
|
||||
quantize_row_q4_k_reference(src + j, y, k);
|
||||
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
|
||||
quantize_row_q4_K_reference(src + j, y, k);
|
||||
}
|
||||
return (n/QK_K*sizeof(block_q4_k));
|
||||
return (n/QK_K*sizeof(block_q4_K));
|
||||
}
|
||||
|
||||
// ====================== 5-bit (de)-quantization
|
||||
|
||||
void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k) {
|
||||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -731,7 +731,7 @@ void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict
|
|||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k) {
|
||||
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -759,26 +759,26 @@ void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int
|
|||
}
|
||||
}
|
||||
|
||||
void quantize_row_q5_k(const float * restrict x, void * restrict vy, int k) {
|
||||
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_q5_k * restrict y = vy;
|
||||
quantize_row_q5_k_reference(x, y, k);
|
||||
block_q5_K * restrict y = vy;
|
||||
quantize_row_q5_K_reference(x, y, k);
|
||||
}
|
||||
|
||||
size_t ggml_quantize_q5_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
(void)hist;
|
||||
for (int j = 0; j < nb; j += k) {
|
||||
block_q5_k * restrict y = (block_q5_k *)dst + j/QK_K;
|
||||
quantize_row_q5_k_reference(src + j, y, k);
|
||||
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
||||
quantize_row_q5_K_reference(src + j, y, k);
|
||||
}
|
||||
return (n/QK_K*sizeof(block_q5_k));
|
||||
return (n/QK_K*sizeof(block_q5_K));
|
||||
}
|
||||
|
||||
// ====================== 6-bit (de)-quantization
|
||||
|
||||
void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k) {
|
||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -842,7 +842,7 @@ void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict
|
|||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k) {
|
||||
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -875,28 +875,28 @@ void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int
|
|||
}
|
||||
}
|
||||
|
||||
void quantize_row_q6_k(const float * restrict x, void * restrict vy, int k) {
|
||||
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_q6_k * restrict y = vy;
|
||||
quantize_row_q6_k_reference(x, y, k);
|
||||
block_q6_K * restrict y = vy;
|
||||
quantize_row_q6_K_reference(x, y, k);
|
||||
}
|
||||
|
||||
size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
(void)hist; // TODO
|
||||
|
||||
for (int j = 0; j < nb; j += k) {
|
||||
block_q6_k * restrict y = (block_q6_k *)dst + j/QK_K;
|
||||
quantize_row_q6_k_reference(src + j, y, k);
|
||||
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
||||
quantize_row_q6_K_reference(src + j, y, k);
|
||||
}
|
||||
return (n/QK_K*sizeof(block_q6_k));
|
||||
return (n/QK_K*sizeof(block_q6_K));
|
||||
}
|
||||
|
||||
//===================================== Q8_K ==============================================
|
||||
|
||||
void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k) {
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -933,7 +933,7 @@ void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict
|
|||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k) {
|
||||
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
@ -944,8 +944,8 @@ void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int
|
|||
}
|
||||
}
|
||||
|
||||
void quantize_row_q8_k(const float * restrict x, void * restrict y, int k) {
|
||||
quantize_row_q8_k_reference(x, y, k);
|
||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
|
||||
quantize_row_q8_K_reference(x, y, k);
|
||||
}
|
||||
|
||||
//===================================== Dot ptoducts =================================
|
||||
|
@ -1002,10 +1002,10 @@ static inline __m128i get_scale_shuffle(int i) {
|
|||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
|
||||
const block_q2_k * restrict x = vx;
|
||||
const block_q8_k * restrict y = vy;
|
||||
const block_q2_K * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
|
@ -1201,14 +1201,14 @@ void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restri
|
|||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
assert(n % QK_K == 0);
|
||||
|
||||
const uint32_t kmask1 = 0x03030303;
|
||||
const uint32_t kmask2 = 0x0f0f0f0f;
|
||||
|
||||
const block_q3_k * restrict x = vx;
|
||||
const block_q8_k * restrict y = vy;
|
||||
const block_q3_K * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
|
@ -1501,11 +1501,11 @@ void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restri
|
|||
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
assert(n % QK_K == 0);
|
||||
|
||||
const block_q4_k * restrict x = vx;
|
||||
const block_q8_k * restrict y = vy;
|
||||
const block_q4_K * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
|
@ -1727,11 +1727,11 @@ void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restri
|
|||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
assert(n % QK_K == 0);
|
||||
|
||||
const block_q5_k * restrict x = vx;
|
||||
const block_q8_k * restrict y = vy;
|
||||
const block_q5_K * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
|
@ -1974,11 +1974,11 @@ void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restri
|
|||
|
||||
|
||||
|
||||
void ggml_vec_dot_q6_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
assert(n % QK_K == 0);
|
||||
|
||||
const block_q6_k * restrict x = vx;
|
||||
const block_q8_k * restrict y = vy;
|
||||
const block_q6_K * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
|
@ -22,8 +22,8 @@ typedef struct {
|
|||
uint8_t qs[QK_K/4]; // quants
|
||||
ggml_fp16_t d; // super-block scale for quantized scales
|
||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||
} block_q2_k;
|
||||
static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding");
|
||||
} block_q2_K;
|
||||
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
||||
|
||||
// 3-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
|
@ -34,8 +34,8 @@ typedef struct {
|
|||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
||||
ggml_fp16_t d; // super-block scale
|
||||
} block_q3_k;
|
||||
static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding");
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
|
||||
|
||||
// 4-bit quantization
|
||||
// 16 blocks of 32 elements each
|
||||
|
@ -46,8 +46,8 @@ typedef struct {
|
|||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_k;
|
||||
static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding");
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
||||
|
||||
// 5-bit quantization
|
||||
// 16 blocks of 32 elements each
|
||||
|
@ -59,8 +59,8 @@ typedef struct {
|
|||
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_k;
|
||||
static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding");
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||
|
||||
// 6-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
|
@ -71,52 +71,52 @@ typedef struct {
|
|||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||
ggml_fp16_t d; // super-block scale
|
||||
} block_q6_k;
|
||||
static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_k block size/padding");
|
||||
} block_q6_K;
|
||||
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
|
||||
|
||||
// This is only used for intermediate quantization and dot products
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
int8_t qs[QK_K]; // quants
|
||||
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
|
||||
} block_q8_k;
|
||||
static_assert(sizeof(block_q8_k) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_k block size/padding");
|
||||
} block_q8_K;
|
||||
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
|
||||
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k);
|
||||
void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k);
|
||||
void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k);
|
||||
void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k);
|
||||
void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k);
|
||||
void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k);
|
||||
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
|
||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
||||
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
||||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||
|
||||
void quantize_row_q2_k(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q3_k(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_k(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_k(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q6_k(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_k(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q2_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q3_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q6_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
|
||||
// Quantization with histogram collection
|
||||
size_t ggml_quantize_q2_k(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q3_k(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_k(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q5_k(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
13
llama.cpp
13
llama.cpp
|
@ -1059,17 +1059,18 @@ static void llama_model_load_internal(
|
|||
}
|
||||
}
|
||||
|
||||
(void) main_gpu;
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
|
||||
ggml_cuda_set_main_device(main_gpu);
|
||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
||||
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
|
||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
||||
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
|
||||
#else
|
||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
|
||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
|
||||
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
|
||||
#endif
|
||||
|
||||
|
@ -1152,8 +1153,7 @@ static void llama_model_load_internal(
|
|||
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
|
||||
(void) vram_scratch;
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
vram_scratch = n_batch * MB;
|
||||
ggml_cuda_set_scratch_size(vram_scratch);
|
||||
|
@ -1163,6 +1163,8 @@ static void llama_model_load_internal(
|
|||
}
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
|
||||
fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
|
||||
if (n_gpu_layers > (int) hparams.n_layer) {
|
||||
fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
|
||||
|
@ -1331,6 +1333,7 @@ static bool llama_eval_internal(
|
|||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
(void) i_gpu_start;
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
offload_func_t offload_func = llama_nop;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue