diff --git a/.gitignore b/.gitignore index def74a1e9..cf1b692e9 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,7 @@ models-mnt /lookup /main /metal +/passkey /perplexity /q8dot /quantize diff --git a/Makefile b/Makefile index 28c6d79bc..4c7e175bf 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ BUILD_TARGETS = \ main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ - speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o + speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ @@ -665,6 +665,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + ifdef LLAMA_METAL metal: examples/metal/metal.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/README.md b/README.md index ca6d14e17..2f6e6ffee 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ as the main playground for developing new features for the [ggml](https://github - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) +- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 6617c050d..7f7186cde 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -138,6 +138,7 @@ struct cmd_params { std::vector n_threads; std::vector n_gpu_layers; std::vector main_gpu; + std::vector no_kv_offload; std::vector mul_mat_q; std::vector> tensor_split; int reps; @@ -155,6 +156,7 @@ static const cmd_params cmd_params_defaults = { /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, /* main_gpu */ {0}, + /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, /* tensor_split */ {{}}, /* reps */ 5, @@ -176,6 +178,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); printf(" -ts, --tensor_split \n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); @@ -309,6 +312,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.main_gpu = split(argv[i], split_delim); + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); } else if (arg == "-mmq" || arg == "--mul-mat-q") { if (++i >= argc) { invalid_param = true; @@ -383,6 +393,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } + if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } @@ -400,6 +411,7 @@ struct cmd_params_instance { int n_threads; int n_gpu_layers; int main_gpu; + bool no_kv_offload; bool mul_mat_q; std::array tensor_split; @@ -428,6 +440,7 @@ struct cmd_params_instance { cparams.type_k = type_k; cparams.type_v = type_v; cparams.mul_mat_q = mul_mat_q; + cparams.offload_kqv = !no_kv_offload; return cparams; } @@ -444,6 +457,7 @@ static std::vector get_cmd_params_instances_int(const cmd_p for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) for (const auto & mmq : params.mul_mat_q) + for (const auto & nkvo : params.no_kv_offload) for (const auto & nt : params.n_threads) { cmd_params_instance instance = { /* .model = */ m, @@ -455,6 +469,7 @@ static std::vector get_cmd_params_instances_int(const cmd_p /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, }; @@ -476,6 +491,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) for (const auto & mmq : params.mul_mat_q) + for (const auto & nkvo : params.no_kv_offload) for (const auto & nt : params.n_threads) { for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { @@ -491,6 +507,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, }; @@ -511,6 +528,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, }; @@ -559,6 +577,7 @@ struct test { ggml_type type_v; int n_gpu_layers; int main_gpu; + bool no_kv_offload; bool mul_mat_q; std::array tensor_split; int n_prompt; @@ -579,6 +598,7 @@ struct test { type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; main_gpu = inst.main_gpu; + no_kv_offload = inst.no_kv_offload; mul_mat_q = inst.mul_mat_q; tensor_split = inst.tensor_split; n_prompt = inst.n_prompt; @@ -640,7 +660,8 @@ struct test { "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "type_k", "type_v", - "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split", + "n_gpu_layers", "main_gpu", "no_kv_offload", + "mul_mat_q", "tensor_split", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" @@ -659,7 +680,7 @@ struct test { return INT; } if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" || - field == "f16_kv" || field == "mul_mat_q") { + field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -690,7 +711,8 @@ struct test { cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), - std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str, + std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload), + std::to_string(mul_mat_q), tensor_split_str, std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ts()), std::to_string(stdev_ts()) @@ -851,6 +873,9 @@ struct markdown_printer : public printer { if (field == "mul_mat_q") { return "mmq"; } + if (field == "no_kv_offload") { + return "nkvo"; + } if (field == "tensor_split") { return "ts"; } @@ -885,6 +910,9 @@ struct markdown_printer : public printer { if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) { fields.push_back("mul_mat_q"); } + if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { + fields.push_back("no_kv_offload"); + } if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { fields.push_back("tensor_split"); } diff --git a/examples/passkey/README.md b/examples/passkey/README.md new file mode 100644 index 000000000..4a22bb559 --- /dev/null +++ b/examples/passkey/README.md @@ -0,0 +1,12 @@ +# llama.cpp/example/passkey + +See the following PRs for more info: + +- https://github.com/ggerganov/llama.cpp/pull/3856 +- https://github.com/ggerganov/llama.cpp/pull/4810 + +### Usage + +```bash +make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250 +``` diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 54b266be4..2df64b111 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1872,14 +1872,6 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, v.y = x[ib + iqs + 1]; } -static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const float * x = (const float *) vx; - - // automatic half -> float type cast if dfloat == float - v.x = x[ib + iqs + 0]; - v.y = x[ib + iqs + 1]; -} - static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) { const int ix = blockDim.x*blockIdx.x + threadIdx.x; @@ -1983,7 +1975,7 @@ static __global__ void k_get_rows_float( template static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { - const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; + const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x); if (i >= k) { return; @@ -2002,6 +1994,19 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __ y[iybs + iqs + y_offset] = v.y; } +template +static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + const src_t * x = (src_t *) vx; + + y[i] = x[i]; +} + // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q @@ -5609,7 +5614,7 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con template static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE); dequantize_block<<>>(vx, y, k); } @@ -5659,6 +5664,12 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu #endif } +template +static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + convert_unary<<>>(vx, y, k); +} + static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: @@ -5682,7 +5693,7 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { case GGML_TYPE_Q6_K: return dequantize_row_q6_K_cuda; case GGML_TYPE_F32: - return dequantize_block_cuda<1, 1, convert_f32>; + return convert_unary_cuda; default: return nullptr; } @@ -5711,7 +5722,7 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { case GGML_TYPE_Q6_K: return dequantize_row_q6_K_cuda; case GGML_TYPE_F16: - return dequantize_block_cuda<1, 1, convert_f16>; + return convert_unary_cuda; default: return nullptr; }