Merge remote-tracking branch 'origin/master' into gg/self-extend-part-2
This commit is contained in:
commit
66ad819dca
6 changed files with 72 additions and 16 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -51,6 +51,7 @@ models-mnt
|
||||||
/lookup
|
/lookup
|
||||||
/main
|
/main
|
||||||
/metal
|
/metal
|
||||||
|
/passkey
|
||||||
/perplexity
|
/perplexity
|
||||||
/q8dot
|
/q8dot
|
||||||
/quantize
|
/quantize
|
||||||
|
|
5
Makefile
5
Makefile
|
@ -2,7 +2,7 @@
|
||||||
BUILD_TARGETS = \
|
BUILD_TARGETS = \
|
||||||
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||||
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
||||||
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o
|
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
|
||||||
|
|
||||||
# Binaries only useful for tests
|
# Binaries only useful for tests
|
||||||
TEST_TARGETS = \
|
TEST_TARGETS = \
|
||||||
|
@ -665,6 +665,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
|
||||||
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
ifdef LLAMA_METAL
|
ifdef LLAMA_METAL
|
||||||
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
|
@ -118,6 +118,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
||||||
|
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
||||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||||
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||||
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
|
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
|
||||||
|
|
|
@ -138,6 +138,7 @@ struct cmd_params {
|
||||||
std::vector<int> n_threads;
|
std::vector<int> n_threads;
|
||||||
std::vector<int> n_gpu_layers;
|
std::vector<int> n_gpu_layers;
|
||||||
std::vector<int> main_gpu;
|
std::vector<int> main_gpu;
|
||||||
|
std::vector<bool> no_kv_offload;
|
||||||
std::vector<bool> mul_mat_q;
|
std::vector<bool> mul_mat_q;
|
||||||
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
|
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
|
||||||
int reps;
|
int reps;
|
||||||
|
@ -155,6 +156,7 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* n_threads */ {get_num_physical_cores()},
|
/* n_threads */ {get_num_physical_cores()},
|
||||||
/* n_gpu_layers */ {99},
|
/* n_gpu_layers */ {99},
|
||||||
/* main_gpu */ {0},
|
/* main_gpu */ {0},
|
||||||
|
/* no_kv_offload */ {false},
|
||||||
/* mul_mat_q */ {true},
|
/* mul_mat_q */ {true},
|
||||||
/* tensor_split */ {{}},
|
/* tensor_split */ {{}},
|
||||||
/* reps */ 5,
|
/* reps */ 5,
|
||||||
|
@ -176,6 +178,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||||
|
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||||
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
||||||
printf(" -ts, --tensor_split <ts0/ts1/..> \n");
|
printf(" -ts, --tensor_split <ts0/ts1/..> \n");
|
||||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||||
|
@ -309,6 +312,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.main_gpu = split<int>(argv[i], split_delim);
|
params.main_gpu = split<int>(argv[i], split_delim);
|
||||||
|
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = split<bool>(argv[i], split_delim);
|
||||||
|
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
|
||||||
} else if (arg == "-mmq" || arg == "--mul-mat-q") {
|
} else if (arg == "-mmq" || arg == "--mul-mat-q") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -383,6 +393,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
|
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
|
||||||
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
|
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
|
||||||
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
||||||
|
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
||||||
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
||||||
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
||||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||||
|
@ -400,6 +411,7 @@ struct cmd_params_instance {
|
||||||
int n_threads;
|
int n_threads;
|
||||||
int n_gpu_layers;
|
int n_gpu_layers;
|
||||||
int main_gpu;
|
int main_gpu;
|
||||||
|
bool no_kv_offload;
|
||||||
bool mul_mat_q;
|
bool mul_mat_q;
|
||||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||||
|
|
||||||
|
@ -428,6 +440,7 @@ struct cmd_params_instance {
|
||||||
cparams.type_k = type_k;
|
cparams.type_k = type_k;
|
||||||
cparams.type_v = type_v;
|
cparams.type_v = type_v;
|
||||||
cparams.mul_mat_q = mul_mat_q;
|
cparams.mul_mat_q = mul_mat_q;
|
||||||
|
cparams.offload_kqv = !no_kv_offload;
|
||||||
|
|
||||||
return cparams;
|
return cparams;
|
||||||
}
|
}
|
||||||
|
@ -444,6 +457,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
|
||||||
for (const auto & tk : params.type_k)
|
for (const auto & tk : params.type_k)
|
||||||
for (const auto & tv : params.type_v)
|
for (const auto & tv : params.type_v)
|
||||||
for (const auto & mmq : params.mul_mat_q)
|
for (const auto & mmq : params.mul_mat_q)
|
||||||
|
for (const auto & nkvo : params.no_kv_offload)
|
||||||
for (const auto & nt : params.n_threads) {
|
for (const auto & nt : params.n_threads) {
|
||||||
cmd_params_instance instance = {
|
cmd_params_instance instance = {
|
||||||
/* .model = */ m,
|
/* .model = */ m,
|
||||||
|
@ -455,6 +469,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .mul_mat_q = */ mmq,
|
/* .mul_mat_q = */ mmq,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
};
|
};
|
||||||
|
@ -476,6 +491,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
for (const auto & tk : params.type_k)
|
for (const auto & tk : params.type_k)
|
||||||
for (const auto & tv : params.type_v)
|
for (const auto & tv : params.type_v)
|
||||||
for (const auto & mmq : params.mul_mat_q)
|
for (const auto & mmq : params.mul_mat_q)
|
||||||
|
for (const auto & nkvo : params.no_kv_offload)
|
||||||
for (const auto & nt : params.n_threads) {
|
for (const auto & nt : params.n_threads) {
|
||||||
for (const auto & n_prompt : params.n_prompt) {
|
for (const auto & n_prompt : params.n_prompt) {
|
||||||
if (n_prompt == 0) {
|
if (n_prompt == 0) {
|
||||||
|
@ -491,6 +507,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .mul_mat_q = */ mmq,
|
/* .mul_mat_q = */ mmq,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
};
|
};
|
||||||
|
@ -511,6 +528,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .mul_mat_q = */ mmq,
|
/* .mul_mat_q = */ mmq,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
};
|
};
|
||||||
|
@ -559,6 +577,7 @@ struct test {
|
||||||
ggml_type type_v;
|
ggml_type type_v;
|
||||||
int n_gpu_layers;
|
int n_gpu_layers;
|
||||||
int main_gpu;
|
int main_gpu;
|
||||||
|
bool no_kv_offload;
|
||||||
bool mul_mat_q;
|
bool mul_mat_q;
|
||||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||||
int n_prompt;
|
int n_prompt;
|
||||||
|
@ -579,6 +598,7 @@ struct test {
|
||||||
type_v = inst.type_v;
|
type_v = inst.type_v;
|
||||||
n_gpu_layers = inst.n_gpu_layers;
|
n_gpu_layers = inst.n_gpu_layers;
|
||||||
main_gpu = inst.main_gpu;
|
main_gpu = inst.main_gpu;
|
||||||
|
no_kv_offload = inst.no_kv_offload;
|
||||||
mul_mat_q = inst.mul_mat_q;
|
mul_mat_q = inst.mul_mat_q;
|
||||||
tensor_split = inst.tensor_split;
|
tensor_split = inst.tensor_split;
|
||||||
n_prompt = inst.n_prompt;
|
n_prompt = inst.n_prompt;
|
||||||
|
@ -640,7 +660,8 @@ struct test {
|
||||||
"cpu_info", "gpu_info",
|
"cpu_info", "gpu_info",
|
||||||
"model_filename", "model_type", "model_size", "model_n_params",
|
"model_filename", "model_type", "model_size", "model_n_params",
|
||||||
"n_batch", "n_threads", "type_k", "type_v",
|
"n_batch", "n_threads", "type_k", "type_v",
|
||||||
"n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
|
"n_gpu_layers", "main_gpu", "no_kv_offload",
|
||||||
|
"mul_mat_q", "tensor_split",
|
||||||
"n_prompt", "n_gen", "test_time",
|
"n_prompt", "n_gen", "test_time",
|
||||||
"avg_ns", "stddev_ns",
|
"avg_ns", "stddev_ns",
|
||||||
"avg_ts", "stddev_ts"
|
"avg_ts", "stddev_ts"
|
||||||
|
@ -659,7 +680,7 @@ struct test {
|
||||||
return INT;
|
return INT;
|
||||||
}
|
}
|
||||||
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
|
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
|
||||||
field == "f16_kv" || field == "mul_mat_q") {
|
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
|
||||||
return BOOL;
|
return BOOL;
|
||||||
}
|
}
|
||||||
if (field == "avg_ts" || field == "stddev_ts") {
|
if (field == "avg_ts" || field == "stddev_ts") {
|
||||||
|
@ -690,7 +711,8 @@ struct test {
|
||||||
cpu_info, gpu_info,
|
cpu_info, gpu_info,
|
||||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||||
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||||
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
|
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload),
|
||||||
|
std::to_string(mul_mat_q), tensor_split_str,
|
||||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||||
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
||||||
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
||||||
|
@ -851,6 +873,9 @@ struct markdown_printer : public printer {
|
||||||
if (field == "mul_mat_q") {
|
if (field == "mul_mat_q") {
|
||||||
return "mmq";
|
return "mmq";
|
||||||
}
|
}
|
||||||
|
if (field == "no_kv_offload") {
|
||||||
|
return "nkvo";
|
||||||
|
}
|
||||||
if (field == "tensor_split") {
|
if (field == "tensor_split") {
|
||||||
return "ts";
|
return "ts";
|
||||||
}
|
}
|
||||||
|
@ -885,6 +910,9 @@ struct markdown_printer : public printer {
|
||||||
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
|
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
|
||||||
fields.push_back("mul_mat_q");
|
fields.push_back("mul_mat_q");
|
||||||
}
|
}
|
||||||
|
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
|
||||||
|
fields.push_back("no_kv_offload");
|
||||||
|
}
|
||||||
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
||||||
fields.push_back("tensor_split");
|
fields.push_back("tensor_split");
|
||||||
}
|
}
|
||||||
|
|
12
examples/passkey/README.md
Normal file
12
examples/passkey/README.md
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
# llama.cpp/example/passkey
|
||||||
|
|
||||||
|
See the following PRs for more info:
|
||||||
|
|
||||||
|
- https://github.com/ggerganov/llama.cpp/pull/3856
|
||||||
|
- https://github.com/ggerganov/llama.cpp/pull/4810
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
|
||||||
|
```
|
35
ggml-cuda.cu
35
ggml-cuda.cu
|
@ -1872,14 +1872,6 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
|
||||||
v.y = x[ib + iqs + 1];
|
v.y = x[ib + iqs + 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
|
||||||
const float * x = (const float *) vx;
|
|
||||||
|
|
||||||
// automatic half -> float type cast if dfloat == float
|
|
||||||
v.x = x[ib + iqs + 0];
|
|
||||||
v.y = x[ib + iqs + 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
|
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
|
||||||
const int ix = blockDim.x*blockIdx.x + threadIdx.x;
|
const int ix = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
|
@ -1983,7 +1975,7 @@ static __global__ void k_get_rows_float(
|
||||||
|
|
||||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
||||||
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
|
const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
||||||
|
|
||||||
if (i >= k) {
|
if (i >= k) {
|
||||||
return;
|
return;
|
||||||
|
@ -2002,6 +1994,19 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
|
||||||
y[iybs + iqs + y_offset] = v.y;
|
y[iybs + iqs + y_offset] = v.y;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename src_t, typename dst_t>
|
||||||
|
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
||||||
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (i >= k) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const src_t * x = (src_t *) vx;
|
||||||
|
|
||||||
|
y[i] = x[i];
|
||||||
|
}
|
||||||
|
|
||||||
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
||||||
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
||||||
|
|
||||||
|
@ -5609,7 +5614,7 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
|
||||||
|
|
||||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||||
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
||||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
|
||||||
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5659,6 +5664,12 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename src_t, typename dst_t>
|
||||||
|
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
||||||
|
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||||
|
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||||
|
}
|
||||||
|
|
||||||
static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
|
@ -5682,7 +5693,7 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
return dequantize_row_q6_K_cuda;
|
return dequantize_row_q6_K_cuda;
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
return dequantize_block_cuda<1, 1, convert_f32>;
|
return convert_unary_cuda<float>;
|
||||||
default:
|
default:
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
@ -5711,7 +5722,7 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
return dequantize_row_q6_K_cuda;
|
return dequantize_row_q6_K_cuda;
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
return dequantize_block_cuda<1, 1, convert_f16>;
|
return convert_unary_cuda<half>;
|
||||||
default:
|
default:
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue