From b4d92a59a20eea400d8dd30844a339b76210daa0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 14 Jan 2025 15:42:23 +0100 Subject: [PATCH 01/30] ci : add -no-cnv for tests (#11238) --- ci/run.sh | 66 +++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index abf08a4ff..77c32ce00 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -326,17 +326,17 @@ function gg_run_open_llama_7b_v2 { ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 
1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 
| tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log @@ -460,17 +460,17 @@ function gg_run_pythia_1_4b { ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/llama-cli --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli 
--model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + 
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log @@ -591,17 +591,17 @@ function gg_run_pythia_2_8b { ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model 
${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning 
of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log From f446c2cf6a56a750b67c967505e717a996d2f2fd Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Wed, 15 Jan 2025 08:50:17 +0530 Subject: [PATCH 02/30] SYCL: Add gated linear attention kernel (#11175) * SYCL: Add Gated Linear attention kernel * glahpp: add a space at the end of file * gla: Put the barrier inside the main logic loop --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 4 ++ ggml/src/ggml-sycl/gla.cpp | 105 +++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/gla.hpp | 8 +++ 4 files changed, 118 insertions(+) create mode 100644 ggml/src/ggml-sycl/gla.cpp create mode 100644 ggml/src/ggml-sycl/gla.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 85748a5b4..b1df4e5db 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -29,5 +29,6 @@ #include "wkv6.hpp" #include "outprod.hpp" #include "element_wise.hpp" +#include "gla.hpp" #endif // 
GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 037c8093e..5272ca454 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4040,6 +4040,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens case GGML_OP_RWKV_WKV6: ggml_sycl_op_rwkv_wkv6(ctx, dst); break; + case GGML_OP_GATED_LINEAR_ATTN: + ggml_sycl_op_gated_linear_attn(ctx, dst); + break; default: return false; } @@ -4507,6 +4510,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_LEAKY_RELU: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_RWKV_WKV6: + case GGML_OP_GATED_LINEAR_ATTN: return true; default: return false; diff --git a/ggml/src/ggml-sycl/gla.cpp b/ggml/src/ggml-sycl/gla.cpp new file mode 100644 index 000000000..eedb47486 --- /dev/null +++ b/ggml/src/ggml-sycl/gla.cpp @@ -0,0 +1,105 @@ +#include + +#include "common.hpp" + +template +static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B, u_int T, u_int C, u_int H, float scale, + const float * k, const float * v, const float * r, const float * td, + const float * s, float * dst) { + const u_int head_size = HEAD_SIZE; + const u_int state_size = C * head_size; + const u_int n_seq_tokens = T / B; + sycl::range<1> block_dims((C / H)); + sycl::range<1> grid_dims((B * H)); + stream->submit([&](sycl::handler & cgh) { + /* local memory accessors*/ + auto _k = sycl::local_accessor(sycl::range<1>(head_size), cgh); + auto _r = sycl::local_accessor(sycl::range<1>(head_size), cgh); + auto _td = sycl::local_accessor(sycl::range<1>(head_size), cgh); + + cgh.parallel_for(sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) { + u_int tid = item.get_local_id(0); + u_int bid = item.get_group(0); + + u_int batch_i = bid / H; + u_int head_i = bid % H; + + float state[head_size]; + +#pragma unroll + for (u_int i = 0; i < head_size; i++) { + state[i] = 
s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid]; + } + + for (u_int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; + t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) { + + item.barrier(sycl::access::fence_space::local_space); //sync threads + _k[tid] = k[t]; + _r[tid] = r[t]; + _td[tid] = td[t]; + item.barrier(sycl::access::fence_space::local_space); //sync threads + + const float _v = v[t]; + float y = 0; + + for (u_int j = 0; j < head_size; j += 4) { + const sycl::float4 & k = (sycl::float4 &) (_k[j]); + const sycl::float4 & r = (sycl::float4 &) (_r[j]); + const sycl::float4 & td = (sycl::float4 &) (_td[j]); + sycl::float4 & s = (sycl::float4 &) (state[j]); + sycl::float4 kv; + + kv.x() = k.x() * _v; + kv.y() = k.y() * _v; + kv.z() = k.z() * _v; + kv.w() = k.w() * _v; + + s.x() = s.x() * td.x() + kv.x(); + s.y() = s.y() * td.y() + kv.y(); + s.z() = s.z() * td.z() + kv.z(); + s.w() = s.w() * td.w() + kv.w(); + + y += r.x() * s.x(); + y += r.y() * s.y(); + y += r.z() * s.z(); + y += r.w() * s.w(); + } + dst[t] = y * scale; + } +#pragma unroll + for (u_int i = 0; i < head_size; i++) { + dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i]; + } + }); + }); +} + +void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const float * k_d = static_cast(dst->src[0]->data); + const float * v_d = static_cast(dst->src[1]->data); + const float * r_d = static_cast(dst->src[2]->data); + const float * td_d = static_cast(dst->src[3]->data); + const float * s_d = static_cast(dst->src[4]->data); + + const int64_t B = dst->src[4]->ne[1]; + const int64_t T = dst->src[0]->ne[2]; + const int64_t C = dst->ne[0]; + const int64_t H = dst->src[0]->ne[1]; + + dpct::queue_ptr stream = ctx.stream(); + GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32); + GGML_ASSERT(C % H == 0); + GGML_ASSERT(C / H == 64 || C / H == 128); + + float scale; + 
memcpy(&scale, dst->op_params, sizeof(float)); + + float * dst_d = (float *) dst->data; + + if (C / H == 64) { + gated_linear_attn_f32_kernel<64>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d); + } else { + gated_linear_attn_f32_kernel<128>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d); + } +} diff --git a/ggml/src/ggml-sycl/gla.hpp b/ggml/src/ggml-sycl/gla.hpp new file mode 100644 index 000000000..607cf3a7f --- /dev/null +++ b/ggml/src/ggml-sycl/gla.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_GLA_HPP +#define GGML_SYCL_GLA_HPP + +#include "common.hpp" + +void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_GLA_HPP From 0ccd7f3eb2debe477ffe3c44d5353cc388c9418d Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 15 Jan 2025 05:44:38 +0100 Subject: [PATCH 03/30] examples : add embd_to_audio to tts-outetts.py [no ci] (#11235) This commit contains a suggestion for adding the missing embd_to_audio function from tts.cpp to tts-outetts.py. This introduces a dependency on numpy which I was not sure if that is acceptable or not (only PyTorch was mentioned in the referenced PR). Also the README has been updated with instructions to run the example with llama-server and the python script. Refs: https://github.com/ggerganov/llama.cpp/pull/10784#issuecomment-2548377734 --- examples/tts/README.md | 37 +++++++++++ examples/tts/tts-outetts.py | 128 +++++++++++++++++++++++++++++++++++- 2 files changed, 163 insertions(+), 2 deletions(-) diff --git a/examples/tts/README.md b/examples/tts/README.md index b0d20111a..4509763c6 100644 --- a/examples/tts/README.md +++ b/examples/tts/README.md @@ -78,3 +78,40 @@ play the audio: $ aplay output.wav ``` +### Running the example with llama-server +Running this example with `llama-server` is also possible and requires two +server instances to be started. One will serve the LLM model and the other +will serve the voice decoder model. 
+ +The LLM model server can be started with the following command: +```console +$ ./build/bin/llama-server -m ./models/outetts-0.2-0.5B-q8_0.gguf --port 8020 +``` + +And the voice decoder model server can be started using: +```console +./build/bin/llama-server -m ./models/wavtokenizer-large-75-f16.gguf --port 8021 --embeddings --pooling none +``` + +Then we can run [tts-outetts.py](tts-outetts.py) to generate the audio. + +First create a virtual environment for python and install the required +dependencies (this is only required to be done once): +```console +$ python3 -m venv venv +$ source venv/bin/activate +(venv) pip install requests numpy +``` + +And then run the python script using: +```console +(venv) python ./examples/tts/tts-outetts.py http://localhost:8020 http://localhost:8021 "Hello world" +spectrogram generated: n_codes: 90, n_embd: 1282 +converting to audio ... +audio generated: 28800 samples +audio written to file "output.wav" +``` +And to play the audio we can again use aplay or any other media player: +```console +$ aplay output.wav +``` diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py index 0f81192fc..3791f9fc3 100644 --- a/examples/tts/tts-outetts.py +++ b/examples/tts/tts-outetts.py @@ -3,6 +3,121 @@ import sys #import struct import requests import re +import struct +import numpy as np +from concurrent.futures import ThreadPoolExecutor + + +def fill_hann_window(size, periodic=True): + if periodic: + return np.hanning(size + 1)[:-1] + return np.hanning(size) + + +def irfft(n_fft, complex_input): + return np.fft.irfft(complex_input, n=n_fft) + + +def fold(buffer, n_out, n_win, n_hop, n_pad): + result = np.zeros(n_out) + n_frames = len(buffer) // n_win + + for i in range(n_frames): + start = i * n_hop + end = start + n_win + result[start:end] += buffer[i * n_win:(i + 1) * n_win] + + return result[n_pad:-n_pad] if n_pad > 0 else result + + +def process_frame(args): + l, n_fft, ST, hann = args + frame = irfft(n_fft, ST[l]) + 
frame = frame * hann + hann2 = hann * hann + return frame, hann2 + + +def embd_to_audio(embd, n_codes, n_embd, n_thread=4): + embd = np.asarray(embd, dtype=np.float32).reshape(n_codes, n_embd) + + n_fft = 1280 + n_hop = 320 + n_win = 1280 + n_pad = (n_win - n_hop) // 2 + n_out = (n_codes - 1) * n_hop + n_win + + hann = fill_hann_window(n_fft, True) + + E = np.zeros((n_embd, n_codes), dtype=np.float32) + for l in range(n_codes): + for k in range(n_embd): + E[k, l] = embd[l, k] + + half_embd = n_embd // 2 + S = np.zeros((n_codes, half_embd + 1), dtype=np.complex64) + + for k in range(half_embd): + for l in range(n_codes): + mag = E[k, l] + phi = E[k + half_embd, l] + + mag = np.clip(np.exp(mag), 0, 1e2) + S[l, k] = mag * np.exp(1j * phi) + + res = np.zeros(n_codes * n_fft) + hann2_buffer = np.zeros(n_codes * n_fft) + + with ThreadPoolExecutor(max_workers=n_thread) as executor: + args = [(l, n_fft, S, hann) for l in range(n_codes)] + results = list(executor.map(process_frame, args)) + + for l, (frame, hann2) in enumerate(results): + res[l*n_fft:(l+1)*n_fft] = frame + hann2_buffer[l*n_fft:(l+1)*n_fft] = hann2 + + audio = fold(res, n_out, n_win, n_hop, n_pad) + env = fold(hann2_buffer, n_out, n_win, n_hop, n_pad) + + mask = env > 1e-10 + audio[mask] /= env[mask] + + return audio + + +def save_wav(filename, audio_data, sample_rate): + num_channels = 1 + bits_per_sample = 16 + bytes_per_sample = bits_per_sample // 8 + data_size = len(audio_data) * bytes_per_sample + byte_rate = sample_rate * num_channels * bytes_per_sample + block_align = num_channels * bytes_per_sample + chunk_size = 36 + data_size # 36 = size of header minus first 8 bytes + + header = struct.pack( + '<4sI4s4sIHHIIHH4sI', + b'RIFF', + chunk_size, + b'WAVE', + b'fmt ', + 16, # fmt chunk size + 1, # audio format (PCM) + num_channels, + sample_rate, + byte_rate, + block_align, + bits_per_sample, + b'data', + data_size + ) + + audio_data = np.clip(audio_data * 32767, -32768, 32767) + pcm_data = 
audio_data.astype(np.int16) + + with open(filename, 'wb') as f: + f.write(header) + f.write(pcm_data.tobytes()) + def process_text(text: str): text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed @@ -170,6 +285,15 @@ n_embd = len(embd[0]) print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd)) # post-process the spectrogram to convert to audio -# TODO: see the tts.cpp:embd_to_audio() and implement it in Python print('converting to audio ...') -print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python') +audio = embd_to_audio(embd, n_codes, n_embd) +print('audio generated: %d samples' % len(audio)) + +filename = "output.wav" +sample_rate = 24000 # sampling rate + +# zero out first 0.25 seconds +audio[:24000 // 4] = 0.0 + +save_wav(filename, audio, sample_rate) +print('audio written to file "%s"' % filename) From 432df2d5f901a8ac93d1befad916144a0478bd9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 15 Jan 2025 12:51:37 +0100 Subject: [PATCH 04/30] RoPE: fix back, CUDA support for back + noncont. (#11240) * RoPE: fix back, CUDA support for back + noncont. * fix comments reg. non-cont. 
RoPE support [no-ci] --- ggml/include/ggml.h | 19 +- ggml/src/ggml-cpu/ggml-cpu.c | 1 + ggml/src/ggml-cpu/ggml-cpu.cpp | 2 - ggml/src/ggml-cuda/ggml-cuda.cu | 10 +- ggml/src/ggml-cuda/rope.cu | 366 ++++++++++++++------------------ ggml/src/ggml-cuda/rope.cuh | 2 + ggml/src/ggml.c | 56 ++--- src/llama.cpp | 8 +- tests/test-backend-ops.cpp | 63 ++++-- 9 files changed, 269 insertions(+), 258 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8f8cb9e1a..a9c051cd5 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1500,7 +1500,7 @@ extern "C" { // rotary position embedding backward, i.e compute dx from dy // a - dy - GGML_API struct ggml_tensor * ggml_rope_back( + GGML_API struct ggml_tensor * ggml_rope_ext_back( struct ggml_context * ctx, struct ggml_tensor * a, // gradients of ggml_rope result struct ggml_tensor * b, // positions @@ -1515,6 +1515,23 @@ extern "C" { float beta_fast, float beta_slow); + GGML_API struct ggml_tensor * ggml_rope_multi_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int n_dims, + int sections[4], + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + // clamp // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_clamp( diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 2966ff768..7c2e45f86 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13668,6 +13668,7 @@ struct ggml_cplan ggml_graph_plan( } break; case GGML_OP_SOFT_MAX: case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: { cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; } break; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index f11399cc6..5c47ceb73 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -403,8 +403,6 @@ static bool 
ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type; - case GGML_OP_ROPE_BACK: - return op->src[2] == NULL && (op->op_params[2] & 4) == 0; case GGML_OP_IM2COL_BACK: return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32; case GGML_OP_OUT_PROD: diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 1dac397c4..9118edc72 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2141,6 +2141,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_ROPE: ggml_cuda_op_rope(ctx, dst); break; + case GGML_OP_ROPE_BACK: + ggml_cuda_op_rope_back(ctx, dst); + break; case GGML_OP_IM2COL: ggml_cuda_op_im2col(ctx, dst); break; @@ -3025,7 +3028,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_SOFT_MAX: return true; case GGML_OP_ROPE: - return ggml_is_contiguous(op->src[0]); + case GGML_OP_ROPE_BACK: { + const size_t ts = ggml_type_size(op->src[0]->type); + const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2]; + return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts; + } case GGML_OP_IM2COL: case GGML_OP_POOL_2D: case GGML_OP_SUM: @@ -3081,6 +3088,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) { return op->ne[1]; case GGML_OP_MUL_MAT_ID: case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: return op->ne[2]; default: return ggml_nrows(op); diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 2c84778d2..e1912fee1 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -16,9 +16,10 @@ static __device__ float rope_yarn_ramp(const float low, const float high, const // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from 
https://github.com/jquesnelle/yarn // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +template static __device__ void rope_yarn( - float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, - float * cos_theta, float * sin_theta) { + const float theta_extrap, const float freq_scale, const rope_corr_dims corr_dims, const int64_t i0, const float ext_factor, + float mscale, float & cos_theta, float & sin_theta) { // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; float theta = theta_interp; @@ -29,24 +30,28 @@ static __device__ void rope_yarn( // Get n-d magnitude scaling corrected for interpolation mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); } - *cos_theta = cosf(theta) * mscale; - *sin_theta = sinf(theta) * mscale; + cos_theta = cosf(theta) * mscale; + sin_theta = sinf(theta) * mscale; + if (!forward) { + sin_theta *= -1.0f; + } } -template +template static __global__ void rope_norm( - const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) { + const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, + const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { return; } - const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; if (i0 >= n_dims) { - const int i = row*ne0 + i0; + const int i = row_dst*ne0 + i0; dst[i + 0] = x[i + 0]; dst[i + 1] = x[i + 1]; @@ -54,39 +59,43 @@ static __global__ void rope_norm( return; } - const int i = row*ne0 + i0; 
- const int i2 = row/p_delta_rows; + const int row_x = row_dst % ne1; + const int channel_x = row_dst / ne1; - const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); + const int idst = row_dst*ne0 + i0; + const int ix = channel_x*s2 + row_x*s1 + i0; + + const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; float cos_theta; float sin_theta; - rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta); - const float x0 = x[i + 0]; - const float x1 = x[i + 1]; + const float x0 = x[ix + 0]; + const float x1 = x[ix + 1]; - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + 1] = x0*sin_theta + x1*cos_theta; + dst[idst + 0] = x0*cos_theta - x1*sin_theta; + dst[idst + 1] = x0*sin_theta + x1*cos_theta; } -template +template static __global__ void rope_neox( - const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) { + const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, + const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { return; } - const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; if (i0 >= n_dims) { - const int i = row*ne0 + i0; + const int i = row_dst*ne0 + i0; dst[i + 0] = x[i + 0]; dst[i + 1] = x[i + 1]; @@ -94,39 +103,43 @@ static __global__ void rope_neox( return; } - const int i = row*ne0 + i0/2; - const int i2 = 
row/p_delta_rows; + const int row_x = row_dst % ne1; + const int channel_x = row_dst / ne1; - const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); + const int idst = row_dst*ne0 + i0/2; + const int ix = channel_x*s2 + row_x*s1 + i0/2; + + const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; float cos_theta; float sin_theta; - rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta); - const float x0 = x[i + 0]; - const float x1 = x[i + n_dims/2]; + const float x0 = x[ix + 0]; + const float x1 = x[ix + n_dims/2]; - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; + dst[idst + 0] = x0*cos_theta - x1*sin_theta; + dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta; } -template +template static __global__ void rope_multi( - const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) { + const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, + const int n_dims, const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { return; } - const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; if (i0 >= n_dims) { - const int i = row*ne0 + i0; + const int i = row_dst*ne0 + i0; dst[i + 0] = x[i + 0]; dst[i + 1] = x[i + 1]; @@ -134,25 +147,28 
@@ static __global__ void rope_multi( return; } - const int i = row*ne0 + i0/2; - const int i2 = row/p_delta_rows; + const int row_x = row_dst % ne1; + const int channel_x = row_dst / ne1; - int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3]; - int sec_w = sections.v[1] + sections.v[0]; - int sector = (i0 / 2) % sect_dims; + const int idst = row_dst*ne0 + i0/2; + const int ix = channel_x*s2 + row_x*s1 + i0/2; + + const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3]; + const int sec_w = sections.v[1] + sections.v[0]; + const int sector = (i0 / 2) % sect_dims; float theta_base = 0.0; if (sector < sections.v[0]) { - theta_base = pos[i2]*powf(theta_scale, i0/2.0f); + theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); } else if (sector >= sections.v[0] && sector < sec_w) { - theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f); + theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); } else if (sector >= sec_w && sector < sec_w + sections.v[2]) { - theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f); + theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); } else if (sector >= sec_w + sections.v[2]) { - theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f); + theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); } const float freq_factor = has_ff ? 
freq_factors[i0/2] : 1.0f; @@ -160,42 +176,46 @@ static __global__ void rope_multi( float cos_theta; float sin_theta; - rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta); - const float x0 = x[i + 0]; - const float x1 = x[i + n_dims/2]; + const float x0 = x[ix + 0]; + const float x1 = x[ix + n_dims/2]; - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; + dst[idst + 0] = x0*cos_theta - x1*sin_theta; + dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta; } -template +template static __global__ void rope_vision( - const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) { + const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, + const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, + const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { return; } - const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - const int i = row*ne0 + i0/2; - const int i2 = row/p_delta_rows; // i2-th tokens + const int row_x = row_dst % ne1; + const int channel_x = row_dst / ne1; - int sect_dims = sections.v[0] + sections.v[1]; - int sec_w = sections.v[1] + sections.v[0]; - int sector = (i0 / 2) % sect_dims; + const int idst = row_dst*ne0 + i0/2; + const int ix = channel_x*s2 + row_x*s1 + i0/2; + + const int sect_dims = sections.v[0] + sections.v[1]; + const 
int sec_w = sections.v[1] + sections.v[0]; + const int sector = (i0 / 2) % sect_dims; float theta_base = 0.0; if (sector < sections.v[0]) { const int p = sector; - theta_base = pos[i2]*powf(theta_scale, p); + theta_base = pos[channel_x]*powf(theta_scale, p); } else if (sector >= sections.v[0] && sector < sec_w) { const int p = sector - sections.v[0]; - theta_base = pos[i2 + ne2]*powf(theta_scale, p); + theta_base = pos[channel_x + ne2]*powf(theta_scale, p); } const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; @@ -203,19 +223,20 @@ static __global__ void rope_vision( float cos_theta; float sin_theta; - rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta); - const float x0 = x[i + 0]; - const float x1 = x[i + n_dims]; + const float x0 = x[ix + 0]; + const float x1 = x[ix + n_dims]; - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + n_dims] = x0*sin_theta + x1*cos_theta; + dst[idst + 0] = x0*cos_theta - x1*sin_theta; + dst[idst + n_dims] = x0*sin_theta + x1*cos_theta; } -template +template static void rope_norm_cuda( - const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { + const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr, + const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) { GGML_ASSERT(ne0 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); @@ -224,22 
+245,21 @@ static void rope_norm_cuda( const float theta_scale = powf(freq_base, -2.0f/n_dims); if (freq_factors == nullptr) { - rope_norm<<>>( - x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors - ); + rope_norm<<>>( + x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors); } else { - rope_norm<<>>( - x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors - ); + rope_norm<<>>( + x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors); } } -template +template static void rope_neox_cuda( - const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { + const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr, + const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) { GGML_ASSERT(ne0 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); @@ -248,22 +268,21 @@ static void rope_neox_cuda( const float theta_scale = powf(freq_base, -2.0f/n_dims); if (freq_factors == nullptr) { - rope_neox<<>>( - x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors - ); + rope_neox<<>>( + x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors); } else { - rope_neox<<>>( - x, dst, ne0, n_dims, pos, freq_scale, 
p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors - ); + rope_neox<<>>( + x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors); } } -template +template static void rope_multi_cuda( - const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) { + const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr, + const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) { GGML_ASSERT(ne0 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); @@ -272,22 +291,21 @@ static void rope_multi_cuda( const float theta_scale = powf(freq_base, -2.0f/n_dims); if (freq_factors == nullptr) { - rope_multi<<>>( - x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, sections - ); + rope_multi<<>>( + x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, sections); } else { - rope_multi<<>>( - x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, sections - ); + rope_multi<<>>( + x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, sections); } } -template +template static void rope_vision_cuda( - const T * x, T * dst, int ne0, int ne2, int 
n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) { + const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr, + const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) { GGML_ASSERT(ne0 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); @@ -298,80 +316,18 @@ static void rope_vision_cuda( const float theta_scale = powf(freq_base, -2.0f/n_dims); if (freq_factors == nullptr) { - rope_vision<<>>( - x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, sections - ); + rope_vision<<>>( + x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, sections); } else { - rope_vision<<>>( - x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, sections - ); + rope_vision<<>>( + x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, sections); } } -static void rope_norm_cuda_f16( - const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { - - rope_norm_cuda(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, 
freq_factors, stream); -} - -static void rope_norm_cuda_f32( - const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { - - rope_norm_cuda(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); -} - -static void rope_neox_cuda_f16( - const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { - - rope_neox_cuda(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); -} - -static void rope_neox_cuda_f32( - const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream -) { - - rope_neox_cuda(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); -} - -static void rope_multi_cuda_f16( - const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream -) { - - rope_multi_cuda(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); -} - -static void rope_multi_cuda_f32( - const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, 
float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream -) { - - rope_multi_cuda(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); -} - -static void rope_vision_cuda_f16( - const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream -) { - - rope_vision_cuda(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); -} - -static void rope_vision_cuda_f32( - const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream -) { - - rope_vision_cuda(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); -} - -void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +template +void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src2 = dst->src[2]; @@ -382,7 +338,6 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); - GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == dst->type); @@ -392,6 +347,9 @@ void 
ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ne02 = src0->ne[2]; // num heads const int64_t nr = ggml_nrows(src0); + const size_t s01 = src0->nb[1] / ggml_type_size(src0->type); + const size_t s02 = src0->nb[2] / ggml_type_size(src0->type); + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; @@ -440,59 +398,59 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { // compute if (is_neox) { if (src0->type == GGML_TYPE_F32) { - rope_neox_cuda_f32( - (const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, stream - ); + rope_neox_cuda( + (const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale, + freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); } else if (src0->type == GGML_TYPE_F16) { - rope_neox_cuda_f16( - (const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, stream - ); + rope_neox_cuda( + (const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale, + freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); } else { GGML_ABORT("fatal error"); } } else if (is_mrope && !is_vision) { if (src0->type == GGML_TYPE_F32) { - rope_multi_cuda_f32( - (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, sections, stream - ); + rope_multi_cuda( + (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, + freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); } else if (src0->type == GGML_TYPE_F16) { - rope_multi_cuda_f16( - (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, 
pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, sections, stream - ); + rope_multi_cuda( + (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, + freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); } else { GGML_ABORT("fatal error"); } } else if (is_vision) { if (src0->type == GGML_TYPE_F32) { - rope_vision_cuda_f32( - (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, sections, stream - ); + rope_vision_cuda( + (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, + freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); } else if (src0->type == GGML_TYPE_F16) { - rope_vision_cuda_f16( - (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, sections, stream - ); + rope_vision_cuda( + (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, + freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); } else { GGML_ABORT("fatal error"); } } else { if (src0->type == GGML_TYPE_F32) { - rope_norm_cuda_f32( - (const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, stream - ); + rope_norm_cuda( + (const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale, + freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); } else if (src0->type == GGML_TYPE_F16) { - rope_norm_cuda_f16( - (const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, stream - ); + rope_norm_cuda( + (const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, 
pos, freq_scale, + freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); } else { GGML_ABORT("fatal error"); } } } + +void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_rope_impl(ctx, dst); +} + +void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_rope_impl(ctx, dst); +} diff --git a/ggml/src/ggml-cuda/rope.cuh b/ggml/src/ggml-cuda/rope.cuh index 0f787a0b2..9139f3b22 100644 --- a/ggml/src/ggml-cuda/rope.cuh +++ b/ggml/src/ggml-cuda/rope.cuh @@ -3,3 +3,5 @@ #define CUDA_ROPE_BLOCK_SIZE 256 void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index da5b817e1..ecfb84a80 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3695,7 +3695,7 @@ void ggml_rope_yarn_corr_dims( // ggml_rope_back -struct ggml_tensor * ggml_rope_back( +struct ggml_tensor * ggml_rope_ext_back( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -3709,29 +3709,32 @@ struct ggml_tensor * ggml_rope_back( float attn_factor, float beta_fast, float beta_slow) { - GGML_ASSERT(ggml_is_vector(b)); - GGML_ASSERT(b->type == GGML_TYPE_I32); - GGML_ASSERT(a->ne[2] == b->ne[0]); - - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - - int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig }; - memcpy(params + 5, &freq_base, sizeof(float)); - memcpy(params + 6, &freq_scale, sizeof(float)); - memcpy(params + 7, &ext_factor, sizeof(float)); - memcpy(params + 8, &attn_factor, sizeof(float)); - memcpy(params + 9, &beta_fast, sizeof(float)); - memcpy(params + 10, &beta_slow, sizeof(float)); - ggml_set_op_params(result, params, sizeof(params)); - - result->op = GGML_OP_ROPE_BACK; - result->src[0] = a; - result->src[1] = b; - result->src[2] = c; - + struct ggml_tensor * result = ggml_rope_ext( + ctx, a, b, c, 
n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + result->op = GGML_OP_ROPE_BACK; return result; } +struct ggml_tensor * ggml_rope_multi_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int n_dims, + int sections[4], + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + struct ggml_tensor * result = ggml_rope_multi( + ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + result->op = GGML_OP_ROPE_BACK; + return result; +} // ggml_clamp struct ggml_tensor * ggml_clamp( @@ -5594,6 +5597,7 @@ static void ggml_compute_backward( //const int n_ctx = ((int32_t *) tensor->op_params)[3]; const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4]; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + int sections[4] = {0, 0, 0, 0}; memcpy(&freq_base, (const float *) tensor->op_params + 5, sizeof(float)); memcpy(&freq_scale, (const float *) tensor->op_params + 6, sizeof(float)); @@ -5601,10 +5605,14 @@ static void ggml_compute_backward( memcpy(&attn_factor, (const float *) tensor->op_params + 8, sizeof(float)); memcpy(&beta_fast, (const float *) tensor->op_params + 9, sizeof(float)); memcpy(&beta_slow, (const float *) tensor->op_params + 10, sizeof(float)); + memcpy(&sections, tensor->op_params + 11, sizeof(sections)); - ggml_add_or_set(ctx, cgraph, isrc0, - ggml_rope_back(ctx, grad, src1, src2, n_dims, mode, n_ctx_orig, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow)); + struct ggml_tensor * rope_back = grad->ne[2] == src1->ne[0] ? 
+ ggml_rope_ext_back(ctx, grad, src1, src2, n_dims, + mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) : + ggml_rope_multi_back(ctx, grad, src1, src2, n_dims, sections, + mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + ggml_add_or_set(ctx, cgraph, isrc0, rope_back); } GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented"); } break; diff --git a/src/llama.cpp b/src/llama.cpp index daf1b7c97..2e391b3b6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4642,7 +4642,7 @@ struct llm_build_context { 0); cb(v_states, "v_states", il); - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this q_pe = ggml_rope_ext( ctx0, q_pe, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -4651,7 +4651,7 @@ struct llm_build_context { cb(q_pe, "q_pe", il); // shared RoPE key - k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this k_pe = ggml_rope_ext( ctx0, k_pe, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -6496,7 +6496,7 @@ struct llm_build_context { 0); cb(v_states, "v_states", il); - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. 
RoPE, investigate removing this q_pe = ggml_rope_ext( ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -6505,7 +6505,7 @@ struct llm_build_context { cb(q_pe, "q_pe", il); // shared RoPE key - k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this k_pe = ggml_rope_ext( ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 3834e0f84..4c8464d8b 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2192,7 +2192,7 @@ struct test_soft_max : public test_case { }; -// GGML_OP_ROPE +// GGML_OP_ROPE + GGML_OP_ROPE_BACK struct test_rope : public test_case { const ggml_type type; const std::array ne_a; @@ -2204,29 +2204,36 @@ struct test_rope : public test_case { float af; // attn_factor bool ff; int v; // view (1 : non-contiguous a) + bool forward; std::string vars() override { + // forward can be inferred from the op, does not need to be printed return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v); } test_rope(ggml_type type = GGML_TYPE_F32, std::array ne_a = {10, 5, 3, 1}, - int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0) - : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {} + int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, + float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0, bool forward = true) + : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v), forward(forward) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a; if (v & 1) { auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3; a = 
ggml_new_tensor(ctx, type, 4, ne.data()); - ggml_set_param(ctx, a); + if (forward) { + ggml_set_param(ctx, a); + } ggml_set_name(a, "a"); a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); ggml_set_name(a, "view_of_a"); } else { a = ggml_new_tensor(ctx, type, 4, ne_a.data()); - ggml_set_param(ctx, a); + if (forward) { + ggml_set_param(ctx, a); + } ggml_set_name(a, "a"); } @@ -2252,14 +2259,26 @@ struct test_rope : public test_case { if (is_vision) { GGML_ASSERT(n_dims/4 > 0); int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only use first two dimension for image (x, y) coordinate - out = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + if (forward) { + out = ggml_rope_multi (ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } else { + out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } } else { GGML_ASSERT(n_dims/3 > 0); int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0}; - out = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + if (forward) { + out = ggml_rope_multi (ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } else { + out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } } } else { - out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + if (forward) { + out = ggml_rope_ext (ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } else { + out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } } ggml_set_name(out, "out"); @@ -3844,7 +3863,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 
0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f)); - { + for (bool fw : {true, false}) { // fw == forward bool all = true; for (float v : { 0, 1 }) { @@ -3853,29 +3872,29 @@ static std::vector> make_test_cases_eval() { for (float af : { 1.0f, 1.4245f }) { for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { for (bool ff : {false, true}) { // freq_factors - test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B + test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 7B if (all) { - test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B - test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B - test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B + test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 13B + test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 30B + test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 65B } if (all) { - test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B) - test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B) - test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B) - test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm) - test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2) + test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, 
fw)); // neox (falcon 7B) + test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B) + test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B) + test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 2, 512, fs, ef, af, ff, v, fw)); // neox (stablelm) + test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v, fw)); // neox (phi-2) } if (all) { - test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 2B) - test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 7B) - test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl ViT) + test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B) + test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 7B) + test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT) } - test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B) + test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B) } } From 1d8504338ea27c7331998c11afdbd47ce5a9daac Mon Sep 17 00:00:00 2001 From: Junil Kim Date: Wed, 15 Jan 2025 22:17:42 +0900 Subject: [PATCH 05/30] fix: ggml: fix vulkan-shaders-gen build (#10448) * fix: ggml: fix vulkan-shaders-gen build The vulkan-shaders-gen target was not being built correctly in case of 
cross-compilation. Other outputs need to be built for the cross compile target, but vulkan-shaders-gen needs to be built for the host. * refactor: ggml: Improve vulkan-shaders-gen toolchain setup - Add GGML_SHADERS_GEN_TOOLCHAIN CMake option. - Auto-detect host toolchain if not set. * refactor: ggml: Improve vulkan-shaders-gen toolchain setup Use configure_file to generate host_toolchain.cmake from template * fix: ggml: Fix compile error Fix compile error not finding vulkan-shaders-gen * fix: vulkan-shaders-gen build and path handling Fix build issues with vulkan-shaders-gen: - Add target dependency for correct build order - Use CMAKE_HOST_SYSTEM_NAME for executable suffix - Fix MSVC output directory in host toolchain - Normalize path handling for cross-compilation * fix: improve host compiler detection in vulkan shader build Improve host compiler detection for vulkan shader generation: - Add NO_CMAKE_FIND_ROOT_PATH to all compiler searches - Consolidate compiler detection logic - Fix Windows-specific MSVC detection - Ensure correct compiler search in cross-compilation * refactor: Simplify CMake function for detecting host compiler Simplified the CMake function to improve the process of detecting the host compiler. * fix: Remove unnecessary Vulkan library linkage in CMakeLists.txt Since `vulkan-shaders-gen.cpp` only requires the `glslc` executable and not the Vulkan headers or libraries, CMakeLists.txt needs to be corrected. 
(See: ecc93d0558fc3ecb8a5af69d2ece02fae4710ade) * refactor: Rename host_toolchain.cmake.in - Rename host_toolchain.cmake.in to cmake/host-toolchain.cmake.in * refactor: GGML_VULKAN_SHADERS_GEN_TOOLCHAIN Rename the macro GGML_SHADERS_GEN_TOOLCHAIN to GGML_VULKAN_SHADERS_GEN_TOOLCHAIN --- ggml/CMakeLists.txt | 3 + ggml/src/ggml-vulkan/CMakeLists.txt | 64 +++++++++++++++++-- .../ggml-vulkan/cmake/host-toolchain.cmake.in | 15 +++++ .../ggml-vulkan/vulkan-shaders/CMakeLists.txt | 6 +- .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 - 5 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index fe8acc803..185079aa4 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -185,6 +185,9 @@ option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increas option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON) option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON) +# toolchain for vulkan-shaders-gen +set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen") + # extra artifacts option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index c0ddaac82..d970f7e20 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -1,5 +1,20 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + find_package(Vulkan COMPONENTS glslc REQUIRED) +function(detect_host_compiler) + if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + find_program(HOST_C_COMPILER NAMES cl gcc clang NO_CMAKE_FIND_ROOT_PATH) + find_program(HOST_CXX_COMPILER NAMES cl g++ clang++ NO_CMAKE_FIND_ROOT_PATH) + else() + find_program(HOST_C_COMPILER NAMES gcc clang NO_CMAKE_FIND_ROOT_PATH) + 
find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH) + endif() + set(HOST_C_COMPILER "${HOST_C_COMPILER}" PARENT_SCOPE) + set(HOST_CXX_COMPILER "${HOST_CXX_COMPILER}" PARENT_SCOPE) +endfunction() + if (Vulkan_FOUND) message(STATUS "Vulkan found") @@ -73,19 +88,56 @@ if (Vulkan_FOUND) add_compile_definitions(GGML_VULKAN_RUN_TESTS) endif() - add_subdirectory(vulkan-shaders) + if (NOT CMAKE_CROSSCOMPILING) + add_subdirectory(vulkan-shaders) + if (MSVC) + foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER ${CONFIG} CONFIG) + set_target_properties(vulkan-shaders-gen PROPERTIES + RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + endforeach() + endif() + else() + if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN) + set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN}) + else() + detect_host_compiler() + if (NOT HOST_C_COMPILER OR NOT HOST_CXX_COMPILER) + message(FATAL_ERROR "Host compiler not found") + else() + message(STATUS "Host compiler: ${HOST_C_COMPILER} ${HOST_CXX_COMPILER}") + endif() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/host-toolchain.cmake.in ${CMAKE_BINARY_DIR}/host-toolchain.cmake @ONLY) + set(HOST_CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/host-toolchain.cmake) + endif() + message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}") - set (_ggml_vk_genshaders_cmd vulkan-shaders-gen) + include(ExternalProject) + # Native build through ExternalProject_Add + ExternalProject_Add( + vulkan-shaders-gen + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders + CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE} + -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR} + BUILD_COMMAND ${CMAKE_COMMAND} --build . + INSTALL_COMMAND ${CMAKE_COMMAND} --install . 
+ INSTALL_DIR ${CMAKE_BINARY_DIR} + ) + ExternalProject_Add_StepTargets(vulkan-shaders-gen build install) + endif() + set (_ggml_vk_host_suffix $,.exe,>) + set (_ggml_vk_genshaders_cmd ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/vulkan-shaders-gen${_ggml_vk_host_suffix}) set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp) set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp) set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders) set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv) file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp") + set (_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen) - if (NOT CMAKE_CROSSCOMPILING) - set(_ggml_vk_genshaders_cmd "$/${_ggml_vk_genshaders_cmd}") - endif () + if (CMAKE_CROSSCOMPILING) + set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install) + endif() add_custom_command( OUTPUT ${_ggml_vk_header} @@ -99,7 +151,7 @@ if (Vulkan_FOUND) --target-cpp ${_ggml_vk_source} --no-clean - DEPENDS ${_ggml_vk_shader_deps} ${_ggml_vk_genshaders_cmd} + DEPENDS ${_ggml_vk_shader_deps} COMMENT "Generate vulkan shaders" ) diff --git a/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in b/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in new file mode 100644 index 000000000..b6af747a5 --- /dev/null +++ b/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in @@ -0,0 +1,15 @@ +set(CMAKE_BUILD_TYPE Release) +set(CMAKE_C_FLAGS -O2) +set(CMAKE_CXX_FLAGS -O2) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) +set(CMAKE_C_COMPILER @HOST_C_COMPILER@) +set(CMAKE_CXX_COMPILER @HOST_CXX_COMPILER@) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY @CMAKE_RUNTIME_OUTPUT_DIRECTORY@) + +if("@CMAKE_C_COMPILER_ID@" STREQUAL "MSVC") + foreach(CONFIG IN ITEMS DEBUG RELEASE MINSIZEREL RELWITHDEBINFO) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${CONFIG} 
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + endforeach() +endif() diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index bd0c74cb1..074031087 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -1,9 +1,11 @@ find_package (Threads REQUIRED) -find_package(Vulkan COMPONENTS glslc REQUIRED) +find_program(GLSLC_EXECUTABLE glslc) +if(NOT GLSLC_EXECUTABLE) + message(FATAL_ERROR "glslc not found.") +endif() set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_17) target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) -target_link_libraries(vulkan-shaders-gen PRIVATE Vulkan::Vulkan) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 7b5044798..243839917 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -30,8 +30,6 @@ #include #endif -#include - #define ASYNCIO_CONCURRENCY 64 std::mutex lock; From f11cfdfd7fe29436fce512d934c2ff6b94bd89d2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 15 Jan 2025 18:28:35 +0200 Subject: [PATCH 06/30] ci : use -no-cnv in gguf-split tests (#11254) * ci : use -no-cnv in gguf-split tests ggml-ci * ci : use -no-cnv in requantize tests ggml-ci * scripts : fix [no ci] --- examples/gguf-split/tests.sh | 10 +-- examples/quantize/tests.sh | 4 +- scripts/hf.sh | 112 ------------------------ tests/test-lora-conversion-inference.sh | 6 +- 4 files changed, 10 insertions(+), 122 deletions(-) delete mode 100755 scripts/hf.sh diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh index d5a92d605..05a932227 100755 --- a/examples/gguf-split/tests.sh +++ b/examples/gguf-split/tests.sh @@ -41,7 +41,7 @@ echo 
PASS echo # 2b. Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32 echo PASS echo @@ -51,7 +51,7 @@ echo PASS echo # 3b. Test the merged model is loading properly -$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32 echo PASS echo @@ -61,7 +61,7 @@ echo PASS echo # 4b. Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32 echo PASS echo @@ -71,7 +71,7 @@ echo #echo # 5b. Test the merged model is loading properly -#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 +#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 #echo PASS #echo @@ -81,7 +81,7 @@ echo PASS echo # 6b. Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32 echo PASS echo diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh index 24bc970e8..70f7610f9 100644 --- a/examples/quantize/tests.sh +++ b/examples/quantize/tests.sh @@ -47,7 +47,7 @@ echo PASS echo # 3a. Test the requanted model is loading properly -$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32 echo PASS echo @@ -57,7 +57,7 @@ echo PASS echo # 4b. 
Test the requanted model is loading properly -$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32 echo PASS echo diff --git a/scripts/hf.sh b/scripts/hf.sh deleted file mode 100755 index b251925fa..000000000 --- a/scripts/hf.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash -# -# Shortcut for downloading HF models -# -# Usage: -# ./llama-cli -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) -# ./llama-cli -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) -# ./llama-cli -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) -# - -# all logs go to stderr -function log { - echo "$@" 1>&2 -} - -function usage { - log "Usage: $0 [[--url] ] [--repo ] [--file ] [--outdir [-h|--help]" - exit 1 -} - -# check for curl or wget -function has_cmd { - if ! [ -x "$(command -v $1)" ]; then - return 1 - fi -} - -if has_cmd wget; then - cmd="wget -q -c -O %s/%s %s" -elif has_cmd curl; then - cmd="curl -C - -f --output-dir %s -o %s -L %s" -else - log "[E] curl or wget not found" - exit 1 -fi - -url="" -repo="" -file="" -outdir="." 
- -# parse args -while [[ $# -gt 0 ]]; do - case "$1" in - --url) - url="$2" - shift 2 - ;; - --repo) - repo="$2" - shift 2 - ;; - --file) - file="$2" - shift 2 - ;; - --outdir) - outdir="$2" - shift 2 - ;; - -h|--help) - usage - ;; - *) - url="$1" - shift - ;; - esac -done - -if [ -n "$repo" ] && [ -n "$file" ]; then - url="https://huggingface.co/$repo/resolve/main/$file" -fi - -if [ -z "$url" ]; then - log "[E] missing --url" - usage -fi - -# check if the URL is a HuggingFace model, and if so, try to download it -is_url=false - -if [[ ${#url} -gt 22 ]]; then - if [[ ${url:0:22} == "https://huggingface.co" ]]; then - is_url=true - fi -fi - -if [ "$is_url" = false ]; then - log "[E] invalid URL, must start with https://huggingface.co" - exit 0 -fi - -# replace "blob/main" with "resolve/main" -url=${url/blob\/main/resolve\/main} - -basename=$(basename $url) - -log "[+] attempting to download $basename" - -if [ -n "$cmd" ]; then - cmd=$(printf "$cmd" "$outdir" "$basename" "$url") - log "[+] $cmd" - if $cmd; then - echo $outdir/$basename - exit 0 - fi -fi - -log "[-] failed to download" - -exit 1 diff --git a/tests/test-lora-conversion-inference.sh b/tests/test-lora-conversion-inference.sh index fb308a9ff..1d1f4886c 100755 --- a/tests/test-lora-conversion-inference.sh +++ b/tests/test-lora-conversion-inference.sh @@ -80,18 +80,18 @@ run_conversion_and_inference_lora() { # Run inference echo -e "\n\n---------------------------\n\n" echo "Running llama-cli without lora for $model_name with hidden_size $hidden_size..." - OUTPUT_BASE=$(./llama-cli -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + OUTPUT_BASE=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ -p "$EXPECTED_BASE_FIRST_WORD" -n 50 --seed 42 --temp 0) echo -e "\n\n---------------------------\n\n" echo "Running llama-cli with hot lora for $model_name with hidden_size $hidden_size..." 
- OUTPUT_LORA_HOT=$(./llama-cli -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + OUTPUT_LORA_HOT=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf \ -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0) echo -e "\n\n---------------------------\n\n" echo "Running llama-cli with merged lora for $model_name with hidden_size $hidden_size..." - OUTPUT_LORA_MERGED=$(./llama-cli -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \ + OUTPUT_LORA_MERGED=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \ -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0) # Remove any initial white space From adc5dd92e8aea98f5e7ac84f6e1bc15de35130b5 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Wed, 15 Jan 2025 19:50:13 +0000 Subject: [PATCH 07/30] vulkan: scale caching for k quants + misc fixes (#11081) * q6_k scale caching * 16 bit unpack * q4_k test (slow) * revert it * q3_k * q2_k * little stuff * try precalculating products of a and q2_k scales * Revert "try precalculating products of a and q2_k scales" This reverts commit 65110b81f23f66331a50c6e889a7c1ab9470a86b. 
* unpack should be u16, add vim swap to gitignore (about time) * better q4_k scales * q5_k * better q6_k with separate paths for all threads and partial threads in use, plus some more optimizations * q2_k better dequant * q3_k optimizations * q3_k use hmask simd from cpu avx version * make the caches happy * q3_k separate out calculation * q2_k separate out * little stuff * use calc_superblock everywhere * q2_k optimize scale calculation * more barriers --- .gitignore | 1 + .../vulkan-shaders/mul_mat_vec_q2_k.comp | 154 ++++++------ .../vulkan-shaders/mul_mat_vec_q3_k.comp | 141 ++++++----- .../vulkan-shaders/mul_mat_vec_q4_k.comp | 169 ++++++------- .../vulkan-shaders/mul_mat_vec_q5_k.comp | 229 +++++++++--------- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 144 ++++++----- 6 files changed, 454 insertions(+), 384 deletions(-) diff --git a/.gitignore b/.gitignore index 1df7cf4a1..694f36e04 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ *.metallib *.o *.so +*.swp *.tmp # IDE / OS diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp index 6a9b9b2d1..8cdc640e8 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -5,6 +5,80 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; +shared FLOAT_TYPE sccache1[BLOCK_SIZE/16][16]; +shared FLOAT_TYPE sccache2[BLOCK_SIZE/16][16]; + +FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; + +void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) { + const uint y_idx = i * QUANT_K + y_offset; + + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + + barrier(); + if (!all_threads) { 
// when we don't have enough blocks to use all threads + if (i < num_blocks_per_row) { + const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]); + sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF); + sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF); + } + barrier(); + + if (i >= num_blocks_per_row) + continue; + } else { + const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]); + sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF); + sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF); + barrier(); + } + + const uint32_t qs_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16); + const vec4 qs_u32_0 = vec4(unpack8(qs_u32 & 0x03030303)); + const vec4 qs_u32_2 = vec4(unpack8((qs_u32 >> 2) & 0x03030303)); + const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303)); + const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303)); + + vec2 d = vec2(data_a[ib0 + i].d); + const FLOAT_TYPE dall = FLOAT_TYPE(d.x); + const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); + + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]); + vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]); + vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]); + vec2 b48 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]); + vec2 b64 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]); + vec2 b80 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]); + vec2 b96 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]); + vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]); + + FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); + FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); + [[unroll]] for (int l = 0; l < 2; ++l) { + sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[ix][ 8*v_im] * qs_u32_0[l ], + fma(FLOAT_TYPE(b16[l]), sccache1[ix][1 + 8*v_im] * qs_u32_0[l+2], + 
fma(FLOAT_TYPE(b32[l]), sccache1[ix][2 + 8*v_im] * qs_u32_2[l ], + fma(FLOAT_TYPE(b48[l]), sccache1[ix][3 + 8*v_im] * qs_u32_2[l+2], + fma(FLOAT_TYPE(b64[l]), sccache1[ix][4 + 8*v_im] * qs_u32_4[l ], + fma(FLOAT_TYPE(b80[l]), sccache1[ix][5 + 8*v_im] * qs_u32_4[l+2], + fma(FLOAT_TYPE(b96[l]), sccache1[ix][6 + 8*v_im] * qs_u32_6[l ], + fma(FLOAT_TYPE(b112[l]), sccache1[ix][7 + 8*v_im] * qs_u32_6[l+2], sum1)))))))); + sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[ix][ 8*v_im], + fma(FLOAT_TYPE(b16[l]), sccache2[ix][1 + 8*v_im], + fma(FLOAT_TYPE(b32[l]), sccache2[ix][2 + 8*v_im], + fma(FLOAT_TYPE(b48[l]), sccache2[ix][3 + 8*v_im], + fma(FLOAT_TYPE(b64[l]), sccache2[ix][4 + 8*v_im], + fma(FLOAT_TYPE(b80[l]), sccache2[ix][5 + 8*v_im], + fma(FLOAT_TYPE(b96[l]), sccache2[ix][6 + 8*v_im], + fma(FLOAT_TYPE(b112[l]), sccache2[ix][7 + 8*v_im], sum2)))))))); + } + temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n])); + } + } +} + void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -14,88 +88,28 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; const uint tid = gl_LocalInvocationID.x; - const uint itid = tid%16; // 0...16 - const uint ix = tid/16; + const uint itid = tid%16; // 0...15 + const uint ix = tid/16; - const uint step = 8; - - const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const uint v_in = itid - step*v_im; // 0...15 or 0...7 + const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const uint v_in = itid - 8*v_im; // 0...7 const uint l0 = 2*v_in; // 0...15 const uint q_offset = 32*v_im + l0; - const uint s_offset = 8*v_im; const uint y_offset = 128*v_im + l0; - FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { temp[j][i] = FLOAT_TYPE(0); } } - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { - const uint y_idx = i * QUANT_K + y_offset; - - [[unroll]] for (uint n = 0; n < num_rows; ++n) { - const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; - vec2 d = vec2(data_a[ib0 + i].d); - const FLOAT_TYPE dall = FLOAT_TYPE(d.x); - const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); - - uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0]; - uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1]; - - uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F; - uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F; - uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F; - uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F; - - uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32)); - uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32)); - uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32)); - uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32)); - - uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0]; - uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]; - uvec2 qs0 = uvec2(unpack8(qs0_u16)); - uvec2 qs16 = uvec2(unpack8(qs16_u16)); - - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]); - vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]); - vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]); - vec2 b48 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]); - vec2 b64 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]); - vec2 b80 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]); - vec2 
b96 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]); - vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]); - - FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); - FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); - [[unroll]] for (int l = 0; l < 2; ++l) { - sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3), - fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3), - fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3), - fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3), - fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l] >> 4) & 3), - fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3), - fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l] >> 6) & 3), - fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1)))))))); - sum2 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_hi4[0]), - fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_hi4[1]), - fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_hi4[2]), - fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_hi4[3]), - fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_hi4[0]), - fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_hi4[1]), - fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]), - fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2)))))))); - } - temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n])); - } - } - } + const uint nbr_par_th = num_blocks_per_row%it_size; + const uint nbr_all_th = num_blocks_per_row - nbr_par_th; + uint i0 = 0; + [[unroll]] for (; i0 < nbr_all_th; i0 += it_size) + calc_superblock(a_offset, b_offset, itid, v_im, ix, q_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, true); + calc_superblock(a_offset, b_offset, itid, v_im, ix, q_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, false); reduce_result(temp, d_offset, first_row, num_rows, tid); } diff --git 
a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp index 96ef50fdd..3116fad16 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -5,6 +5,74 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; +shared FLOAT_TYPE sccache[BLOCK_SIZE/16][2][8]; + +FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; + +void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) { + const uint y_idx = i * QUANT_K + y_offset; + + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + + if (!all_threads) { // when we don't have enough blocks to use all threads + barrier(); + if (i < num_blocks_per_row) + sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32); + barrier(); + + if (i >= num_blocks_per_row) + continue; + } + + const uint32_t hmk = ~(uint32_t(data_a_packed16[ib0 + i].hmask[v_in]) | (uint32_t(data_a_packed16[ib0 + i].hmask[v_in + 8]) << 16)); + const vec4 hmk_0 = vec4(unpack8(((hmk & hm_m[0]) >> ( v_im4)) << 2)); + const vec4 hmk_1 = vec4(unpack8(((hmk & hm_m[1]) >> (1 + v_im4)) << 2)); + const vec4 hmk_2 = vec4(unpack8(((hmk & hm_m[2]) >> (2 + v_im4)) << 2)); + const vec4 hmk_3 = vec4(unpack8(((hmk & hm_m[3]) >> (3 + v_im4)) << 2)); + + // 0, 1, 16, 17 + uint32_t qs_u32 = uint32_t(data_a[ib0 + i].qs[q_offset]) | (uint32_t(data_a[ib0 + i].qs[q_offset + 1]) << 8); + qs_u32 |= (uint32_t(data_a[ib0 + i].qs[q_offset + 16]) | (uint32_t(data_a[ib0 + i].qs[q_offset + 17]) << 
8)) << 16; + const vec4 qs_u32_0 = vec4(unpack8(qs_u32 & 0x03030303)); + const vec4 qs_u32_2 = vec4(unpack8((qs_u32 >> 2) & 0x03030303)); + const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303)); + const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303)); + + if (all_threads) { + barrier(); + sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32); + barrier(); + } + + const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); + + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]); + vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]); + vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]); + vec2 b48 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]); + vec2 b64 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]); + vec2 b80 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]); + vec2 b96 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]); + vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]); + + FLOAT_TYPE sum = FLOAT_TYPE(0.0); + [[unroll]] for (int l = 0; l < 2; ++l) { + sum = fma(FLOAT_TYPE( b0[l]) * sccache[ix][v_im][0], qs_u32_0[l ] - hmk_0[l ], + fma(FLOAT_TYPE( b16[l]) * sccache[ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2], + fma(FLOAT_TYPE( b32[l]) * sccache[ix][v_im][2], qs_u32_2[l ] - hmk_1[l ], + fma(FLOAT_TYPE( b48[l]) * sccache[ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2], + fma(FLOAT_TYPE( b64[l]) * sccache[ix][v_im][4], qs_u32_4[l ] - hmk_2[l ], + fma(FLOAT_TYPE( b80[l]) * sccache[ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2], + fma(FLOAT_TYPE( b96[l]) * sccache[ix][v_im][6], qs_u32_6[l ] - hmk_3[l ], + fma(FLOAT_TYPE(b112[l]) * sccache[ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum)))))))); + } + temp[j][n] = fma(d, sum, temp[j][n]); + } + 
} +} + void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -14,76 +82,37 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; const uint tid = gl_LocalInvocationID.x; - const uint itid = tid%16; // 0...16 - const uint ix = tid/16; + const uint itid = tid%16; // 0...15 + const uint ix = tid/16; + const uint itid8 = itid%8; - const uint step = 8; + const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128... + const uint v_im4 = v_im*4; + const uint v_in = itid - 8*v_im; // 0...7 - const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const uint v_in = itid - step*v_im; // 0...15 or 0...7 - - const uint8_t m = uint8_t(1 << (4 * v_im)); + const uint32_t m = 0x01010101 << (4 * v_im); + uint32_t hm_m[4]; + [[unroll]] for (uint j = 0; j < 4; ++j) + hm_m[j] = m << j; const uint l0 = 2*v_in; // 0...15 const uint q_offset = 32*v_im + l0; const uint y_offset = 128*v_im + l0; - FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { temp[j][i] = FLOAT_TYPE(0); } } - const uint s_shift = 4 * v_im; + const uint s_shift = v_im4 + 2*(itid8/4); - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { - const uint y_idx = i * QUANT_K + y_offset; - - [[unroll]] for (uint n = 0; n < num_rows; ++n) { - const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; - const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); - - uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0]; - uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1]; - uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2]; - uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3]; - uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4]; - uint16_t s10_16 = data_a_packed16[ib0 + 
i].scales[5]; - u8vec2 s0 = unpack8(s0_16); - u8vec2 s2 = unpack8(s2_16); - u8vec2 s4 = unpack8(s4_16); - u8vec2 s6 = unpack8(s6_16); - u8vec2 s8 = unpack8(s8_16); - u8vec2 s10 = unpack8(s10_16); - - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - - vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]); - vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]); - vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]); - vec2 b48 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]); - vec2 b64 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]); - vec2 b80 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]); - vec2 b96 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]); - vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]); - - FLOAT_TYPE sum = FLOAT_TYPE(0.0); - [[unroll]] for (int l = 0; l < 2; ++l) { - sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b96[l]) * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 
0 : 4)), - fma(FLOAT_TYPE(b16[l]) * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b48[l]) * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum)))))))); - } - temp[j][n] = fma(d, sum, temp[j][n]); - } - } - } + const uint nbr_par_th = num_blocks_per_row%it_size; + const uint nbr_all_th = num_blocks_per_row - nbr_par_th; + uint i0 = 0; + [[unroll]] for (; i0 < nbr_all_th; i0 += it_size) + calc_superblock(a_offset, b_offset, ix, itid8, v_im, v_im4, v_in, hm_m, q_offset, y_offset, s_shift, i0 + ix, num_blocks_per_row, first_row, num_rows, true); + calc_superblock(a_offset, b_offset, ix, itid8, v_im, v_im4, v_in, hm_m, q_offset, y_offset, s_shift, i0 + ix, num_blocks_per_row, first_row, num_rows, false); reduce_result(temp, d_offset, first_row, num_rows, tid); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp index f97eb8744..f9cde0648 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp @@ -6,6 +6,86 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) 
in; +FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; + +void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) { + const uint y1_idx = i * QUANT_K + y_offset; + const uint y2_idx = y1_idx + 128; + + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + vec2 d = vec2(data_a[ib0 + i].d); + const FLOAT_TYPE dall = FLOAT_TYPE(d.x); + const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); + + const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; + const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; + const uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; + + const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32; + const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2; + const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F)); + const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h)); + + const FLOAT_TYPE sc0 = scale_0_4_l_f.x; + const FLOAT_TYPE sc1 = scale_0_4_l_f.y; + const FLOAT_TYPE sc2 = scale_0_4_l_f.z; + const FLOAT_TYPE sc3 = scale_0_4_l_f.w; + const FLOAT_TYPE sc4 = scale8_f.x; + const FLOAT_TYPE sc5 = scale8_f.y; + const FLOAT_TYPE sc6 = scale8_f.z; + const FLOAT_TYPE sc7 = scale8_f.w; + + const uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4]; + const uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16]; + + const uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F; + const uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F; + const uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F; + const uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F; + + const vec4 qs0_lo4 = vec4(unpack8(qs0_u32_lo4)); + const vec4 qs64_lo4 = vec4(unpack8(qs64_u32_lo4)); + const vec4 qs0_hi4 = vec4(unpack8(qs0_u32_hi4)); + const vec4 qs64_hi4 = 
vec4(unpack8(qs64_u32_hi4)); + + const FLOAT_TYPE q4_0 = qs0_lo4.x; + const FLOAT_TYPE q4_1 = qs0_lo4.y; + const FLOAT_TYPE q4_2 = qs0_lo4.z; + const FLOAT_TYPE q4_3 = qs0_lo4.w; + const FLOAT_TYPE q4_4 = qs0_hi4.x; + const FLOAT_TYPE q4_5 = qs0_hi4.y; + const FLOAT_TYPE q4_6 = qs0_hi4.z; + const FLOAT_TYPE q4_7 = qs0_hi4.w; + const FLOAT_TYPE q4_8 = qs64_lo4.x; + const FLOAT_TYPE q4_9 = qs64_lo4.y; + const FLOAT_TYPE q4_10 = qs64_lo4.z; + const FLOAT_TYPE q4_11 = qs64_lo4.w; + const FLOAT_TYPE q4_12 = qs64_hi4.x; + const FLOAT_TYPE q4_13 = qs64_hi4.y; + const FLOAT_TYPE q4_14 = qs64_hi4.z; + const FLOAT_TYPE q4_15 = qs64_hi4.w; + + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + vec4 by10 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 ]); + vec4 by132 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8]); + vec4 by20 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 ]); + vec4 by232 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 + 8]); + + const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3))); + const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7))); + const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11))); + const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x), q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15))); + const FLOAT_TYPE smin = + fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7, + fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7, + fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7, + 
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7))))))))))))))); + temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n])); + } + } +} + void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -15,13 +95,11 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; const uint tid = gl_LocalInvocationID.x; - const uint itid = tid%16; // 0...16 - const uint ix = tid/16; + const uint itid = tid%16; // 0...15 + const uint ix = tid/16; - const uint step = 4; - - const uint il = itid/step; // 0...3 - const uint ir = itid - step*il; // 0...7 or 0...3 + const uint il = itid/4; // 0...3 + const uint ir = itid - 4*il; // 0...3 const uint n = 4; const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 @@ -31,89 +109,14 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint q_offset = 32*v_im + l0; const uint y_offset = 64*v_im + l0; - FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { temp[j][i] = FLOAT_TYPE(0); } } - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { - const uint y1_idx = i * QUANT_K + y_offset; - const uint y2_idx = y1_idx + 128; - - [[unroll]] for (uint n = 0; n < num_rows; ++n) { - const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; - vec2 d = vec2(data_a[ib0 + i].d); - const FLOAT_TYPE dall = FLOAT_TYPE(d.x); - const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); - - uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; - uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; - uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; - 
uvec4 scale0 = uvec4(unpack8(scale0_u32)); - uvec4 scale4 = uvec4(unpack8(scale4_u32)); - uvec4 scale8 = uvec4(unpack8(scale8_u32)); - - const uint32_t sc0 = ( scale0.x & 0x3f); - const uint32_t sc1 = ( scale0.y & 0x3f); - const uint32_t sc2 = ( scale4.x & 0x3f); - const uint32_t sc3 = ( scale4.y & 0x3f); - const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2)); - const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2)); - const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); - const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); - - uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4]; - uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16]; - - uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F; - uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F; - uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F; - uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F; - - uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4)); - uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4)); - uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4)); - uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4)); - - const uint32_t q4_0 = qs0_lo4.x; - const uint32_t q4_1 = qs0_lo4.y; - const uint32_t q4_2 = qs0_lo4.z; - const uint32_t q4_3 = qs0_lo4.w; - const uint32_t q4_4 = qs0_hi4.x; - const uint32_t q4_5 = qs0_hi4.y; - const uint32_t q4_6 = qs0_hi4.z; - const uint32_t q4_7 = qs0_hi4.w; - const uint32_t q4_8 = qs64_lo4.x; - const uint32_t q4_9 = qs64_lo4.y; - const uint32_t q4_10 = qs64_lo4.z; - const uint32_t q4_11 = qs64_lo4.w; - const uint32_t q4_12 = qs64_hi4.x; - const uint32_t q4_13 = qs64_hi4.y; - const uint32_t q4_14 = qs64_hi4.z; - const uint32_t q4_15 = qs64_hi4.w; - - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - vec4 by10 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 ]); - vec4 by132 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8]); - vec4 by20 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + 
y2_idx) / 4 ]); - vec4 by232 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 + 8]); - - const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3))); - const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7))); - const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11))); - const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x), q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15))); - const FLOAT_TYPE smin = - fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7, - fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7, - fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7, - fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7))))))))))))))); - temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n])); - } - } - } + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) + calc_superblock(a_offset, b_offset, v_im, q_offset, y_offset, i, num_blocks_per_row, first_row, num_rows); reduce_result(temp, d_offset, first_row, num_rows, tid); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp index 79d7db0e3..6c84ef3cd 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp @@ -6,6 +6,118 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; +FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; + +void 
calc_superblock(const uint a_offset, const uint b_offset, const uint v_im, const uint l0, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) { + const uint y1_idx = i * QUANT_K + y_offset; + const uint y2_idx = y1_idx + 128; + + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + vec2 d = vec2(data_a[ib0 + i].d); + const FLOAT_TYPE dall = FLOAT_TYPE(d.x); + const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); + + const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; + const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; + const uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; + + const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32; + const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2; + const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F)); + const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h)); + + const FLOAT_TYPE sc0 = scale_0_4_l_f.x; + const FLOAT_TYPE sc1 = scale_0_4_l_f.y; + const FLOAT_TYPE sc2 = scale_0_4_l_f.z; + const FLOAT_TYPE sc3 = scale_0_4_l_f.w; + const FLOAT_TYPE sc4 = scale8_f.x; + const FLOAT_TYPE sc5 = scale8_f.y; + const FLOAT_TYPE sc6 = scale8_f.z; + const FLOAT_TYPE sc7 = scale8_f.w; + + const uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16); + const uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16); + + uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F; + uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F; + uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F; + uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F; + + const uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + 
i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8])); + + const uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4; + const uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3; + const uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010); + const uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1; + + qs0_16_u32_lo4 += qs0_16_lo4_offset16; + qs0_16_u32_hi4 += qs0_16_hi4_offset16; + qs64_80_u32_lo4 += qs64_80_lo4_offset16; + qs64_80_u32_hi4 += qs64_80_hi4_offset16; + + const vec4 qs0_16_lo4 = vec4(unpack8(qs0_16_u32_lo4)); + const vec4 qs64_80_lo4 = vec4(unpack8(qs64_80_u32_lo4)); + const vec4 qs0_16_hi4 = vec4(unpack8(qs0_16_u32_hi4)); + const vec4 qs64_80_hi4 = vec4(unpack8(qs64_80_u32_hi4)); + + const FLOAT_TYPE q4_0 = qs0_16_lo4.x; + const FLOAT_TYPE q4_1 = qs0_16_lo4.y; + const FLOAT_TYPE q4_2 = qs0_16_lo4.z; + const FLOAT_TYPE q4_3 = qs0_16_lo4.w; + const FLOAT_TYPE q4_4 = qs0_16_hi4.x; + const FLOAT_TYPE q4_5 = qs0_16_hi4.y; + const FLOAT_TYPE q4_6 = qs0_16_hi4.z; + const FLOAT_TYPE q4_7 = qs0_16_hi4.w; + const FLOAT_TYPE q4_8 = qs64_80_lo4.x; + const FLOAT_TYPE q4_9 = qs64_80_lo4.y; + const FLOAT_TYPE q4_10 = qs64_80_lo4.z; + const FLOAT_TYPE q4_11 = qs64_80_lo4.w; + const FLOAT_TYPE q4_12 = qs64_80_hi4.x; + const FLOAT_TYPE q4_13 = qs64_80_hi4.y; + const FLOAT_TYPE q4_14 = qs64_80_hi4.z; + const FLOAT_TYPE q4_15 = qs64_80_hi4.w; + + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + vec2 by10 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 ]); + vec2 by116 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 8]); + vec2 by132 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 16]); + vec2 by148 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 24]); + vec2 by20 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 ]); + vec2 by216 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 8]); + vec2 by232 = 
vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 16]); + vec2 by248 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 24]); + + const FLOAT_TYPE sx = + fma(FLOAT_TYPE(by10.x), q4_0, + fma(FLOAT_TYPE(by10.y), q4_1, + fma(FLOAT_TYPE(by116.x), q4_2, + FLOAT_TYPE(by116.y) * q4_3))); + const FLOAT_TYPE sy = + fma(FLOAT_TYPE(by132.x), q4_4, + fma(FLOAT_TYPE(by132.y), q4_5, + fma(FLOAT_TYPE(by148.x), q4_6, + FLOAT_TYPE(by148.y) * q4_7))); + const FLOAT_TYPE sz = + fma(FLOAT_TYPE(by20.x), q4_8, + fma(FLOAT_TYPE(by20.y), q4_9, + fma(FLOAT_TYPE(by216.x), q4_10, + FLOAT_TYPE(by216.y) * q4_11))); + const FLOAT_TYPE sw = + fma(FLOAT_TYPE(by232.x), q4_12, + fma(FLOAT_TYPE(by232.y), q4_13, + fma(FLOAT_TYPE(by248.x), q4_14, + FLOAT_TYPE(by248.y) * q4_15))); + const FLOAT_TYPE smin = + fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2, + fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3, + fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6, + (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7))); + temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n])); + } + } +} + void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -15,11 +127,11 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; const uint tid = gl_LocalInvocationID.x; - const uint itid = tid%16; // 0...16 - const uint ix = tid/16; + const uint itid = tid%16; // 0...15 + const uint ix = tid/16; const uint il = itid/4; // 0...3 - const uint ir = itid - 4*il; // 0...7 or 0...3 + const uint ir = itid - 4*il; // 0...3 const uint v_im = il / 2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const uint v_in = il % 2; @@ -28,121 +140,14 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint q_offset = 32*v_im + l0; const uint y_offset = 64*v_im + l0; - FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { temp[j][i] = FLOAT_TYPE(0); } } - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { - const uint y1_idx = i * QUANT_K + y_offset; - const uint y2_idx = y1_idx + 128; - - [[unroll]] for (uint n = 0; n < num_rows; ++n) { - const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; - vec2 d = vec2(data_a[ib0 + i].d); - const FLOAT_TYPE dall = FLOAT_TYPE(d.x); - const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); - - uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; - uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; - uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; - uvec4 scale0 = uvec4(unpack8(scale0_u32)); - uvec4 scale4 = uvec4(unpack8(scale4_u32)); - uvec4 scale8 = uvec4(unpack8(scale8_u32)); - - const uint32_t sc0 = ( scale0.x & 0x3f); - const uint32_t sc1 = ( scale0.y & 0x3f); - const uint32_t sc2 = ( scale4.x & 0x3f); - const uint32_t sc3 = ( scale4.y & 0x3f); - const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2)); - const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2)); - const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); - const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); - - uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16); - uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16); - - uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F; - uint32_t 
qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F; - uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F; - uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F; - - uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8])); - - uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4; - uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3; - uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0; - uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1; - - qs0_16_u32_lo4 += qs0_16_lo4_offset16; - qs0_16_u32_hi4 += qs0_16_hi4_offset16; - qs64_80_u32_lo4 += qs64_80_lo4_offset16; - qs64_80_u32_hi4 += qs64_80_hi4_offset16; - - uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4)); - uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4)); - uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4)); - uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4)); - - const uint32_t q4_0 = qs0_16_lo4.x; - const uint32_t q4_1 = qs0_16_lo4.y; - const uint32_t q4_2 = qs0_16_lo4.z; - const uint32_t q4_3 = qs0_16_lo4.w; - const uint32_t q4_4 = qs0_16_hi4.x; - const uint32_t q4_5 = qs0_16_hi4.y; - const uint32_t q4_6 = qs0_16_hi4.z; - const uint32_t q4_7 = qs0_16_hi4.w; - const uint32_t q4_8 = qs64_80_lo4.x; - const uint32_t q4_9 = qs64_80_lo4.y; - const uint32_t q4_10 = qs64_80_lo4.z; - const uint32_t q4_11 = qs64_80_lo4.w; - const uint32_t q4_12 = qs64_80_hi4.x; - const uint32_t q4_13 = qs64_80_hi4.y; - const uint32_t q4_14 = qs64_80_hi4.z; - const uint32_t q4_15 = qs64_80_hi4.w; - - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - vec2 by10 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 ]); - vec2 by116 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 8]); - vec2 by132 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 16]); - vec2 by148 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 24]); - vec2 by20 = 
vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 ]); - vec2 by216 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 8]); - vec2 by232 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 16]); - vec2 by248 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 24]); - - const FLOAT_TYPE sx = - fma(FLOAT_TYPE(by10.x), q4_0, - fma(FLOAT_TYPE(by10.y), q4_1, - fma(FLOAT_TYPE(by116.x), q4_2, - FLOAT_TYPE(by116.y) * q4_3))); - const FLOAT_TYPE sy = - fma(FLOAT_TYPE(by132.x), q4_4, - fma(FLOAT_TYPE(by132.y), q4_5, - fma(FLOAT_TYPE(by148.x), q4_6, - FLOAT_TYPE(by148.y) * q4_7))); - const FLOAT_TYPE sz = - fma(FLOAT_TYPE(by20.x), q4_8, - fma(FLOAT_TYPE(by20.y), q4_9, - fma(FLOAT_TYPE(by216.x), q4_10, - FLOAT_TYPE(by216.y) * q4_11))); - const FLOAT_TYPE sw = - fma(FLOAT_TYPE(by232.x), q4_12, - fma(FLOAT_TYPE(by232.y), q4_13, - fma(FLOAT_TYPE(by248.x), q4_14, - FLOAT_TYPE(by248.y) * q4_15))); - const FLOAT_TYPE smin = - fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2, - fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3, - fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6, - (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7))); - temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n])); - } - } - } + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) + calc_superblock(a_offset, b_offset, v_im, l0, q_offset, y_offset, i, num_blocks_per_row, first_row, num_rows); reduce_result(temp, d_offset, first_row, num_rows, tid); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 041fd27c1..f05f96b5e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ 
b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -6,7 +6,77 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { +shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; + +FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; + +void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) { + const uint y_idx = i * QUANT_K + y_offset; + + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + + if (!all_threads) { // when we don't have enough blocks to use all threads + barrier(); + if (i < num_blocks_per_row) + sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]); + barrier(); + + if (i >= num_blocks_per_row) + continue; + } + + const uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16); + const uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16); + + const uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; + const uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; + const uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; + const uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; + + const uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16); + const uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; + const uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; + const uint32_t qh4_u32 = (qh_u32 & 0x30303030); + const uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2; + + const uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32; + const uint32_t 
q1_u32 = ql32_u32_lo4 | qh2_u32; + const uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32; + const uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32; + + const vec4 q0 = vec4(unpack8(q0_u32)) - 32; + const vec4 q1 = vec4(unpack8(q1_u32)) - 32; + const vec4 q2 = vec4(unpack8(q2_u32)) - 32; + const vec4 q3 = vec4(unpack8(q3_u32)) - 32; + + if (all_threads) { + barrier(); + sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]); + barrier(); + } + + const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); + + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + vec4 by0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 ]); + vec4 by32 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8]); + vec4 by64 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]); + vec4 by96 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]); + + FLOAT_TYPE sum[4] = {0, 0, 0, 0}; + [[unroll]] for (uint l = 0; l < 4; ++l) { + sum[0] = fma(FLOAT_TYPE(by0[l]), q0[l], sum[0]); + sum[1] = fma(FLOAT_TYPE(by32[l]), q1[l], sum[1]); + sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]); + sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]); + } + temp[j][n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[j][n]); + } + } +} + +void compute_outputs(const uint first_row, const uint num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -15,13 +85,11 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; const uint tid = gl_LocalInvocationID.x; - const uint itid = tid%16; // 0...16 - const uint ix = tid/16; + const uint itid = tid%16; // 0...15 + const uint ix = tid/16; - const uint step = 8; - - const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
- const uint v_in = itid - step*v_im; // 0...15 or 0...7 + const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128... + const uint v_in = itid - 8*v_im; // 0...7 const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28 const uint is = v_in / 4; @@ -31,68 +99,18 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint s_offset = 8*v_im + is; const uint y_offset = 128*v_im + l0; - FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { temp[j][i] = FLOAT_TYPE(0); } } - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { - const uint y_idx = i * QUANT_K + y_offset; - - [[unroll]] for (uint n = 0; n < num_rows; ++n) { - const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; - const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); - - FLOAT_TYPE scales[4]; - scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]); - scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]); - scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]); - scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]); - - uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16); - uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16); - - uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; - uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; - uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; - uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - - uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16); - uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; - uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; - uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0; - uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 
2; - - uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32; - uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32; - uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32; - uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32; - - uvec4 q0 = uvec4(unpack8(q0_u32)); - uvec4 q1 = uvec4(unpack8(q1_u32)); - uvec4 q2 = uvec4(unpack8(q2_u32)); - uvec4 q3 = uvec4(unpack8(q3_u32)); - - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - vec4 by0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 ]); - vec4 by32 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8]); - vec4 by64 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]); - vec4 by96 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]); - - FLOAT_TYPE sum = FLOAT_TYPE(0.0); - [[unroll]] for (int l = 0; l < 4; ++l) { - sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32), - fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32), - fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32), - fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum)))); - } - temp[j][n] += sum * d; - } - } - } + const uint nbr_par_th = num_blocks_per_row%it_size; + const uint nbr_all_th = num_blocks_per_row - nbr_par_th; + uint i0 = 0; + [[unroll]] for (; i0 < nbr_all_th; i0 += it_size) + calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, true); + calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, false); reduce_result(temp, d_offset, first_row, num_rows, tid); } From c67cc9837d48ea7f612b5666b90d189e63dfd7d3 Mon Sep 17 00:00:00 2001 From: fj-y-saito <85871716+fj-y-saito@users.noreply.github.com> Date: Thu, 16 Jan 2025 18:11:49 +0900 Subject: [PATCH 08/30] ggml: aarch64: implement SVE kernels for q4_K_q8_K vector dot (#11227) * Add SVE support for q4_K_q8_K * Update ggml/src/ggml-cpu/ggml-cpu-quants.c 
change to use K_SCALE_SIZE Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- ggml/src/ggml-cpu/ggml-cpu-quants.c | 83 ++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index 8e1472266..88303ff0e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -5573,7 +5573,88 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r uint32_t utmp[4]; -#ifdef __ARM_NEON +#ifdef __ARM_FEATURE_SVE + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + + const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); + + memcpy(utmp, x[i].scales, K_SCALE_SIZE); + + uint32x2_t mins8 = { 0 }; + mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); + mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); + + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[0] &= kmask1; + + const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8))); + const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), + vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); + sumf -= dmin * vaddvq_s32(prod); + + const uint8_t * scales = (const uint8_t *)utmp; + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const int vector_length = ggml_cpu_get_sve_cnt()*8; + const svuint8_t m4b = svdup_n_u8(0xf); + const svint32_t mzero = svdup_n_s32(0); + svint32_t sumi1 = svdup_n_s32(0); + svint32_t sumi1_1 = svdup_n_s32(0); + svint32_t sumi1_2 = svdup_n_s32(0); + svint32_t sumi2 = svdup_n_s32(0); + svint32_t sumi2_1 = svdup_n_s32(0); + svint32_t sumi2_2 = svdup_n_s32(0); + switch (vector_length) { + case 128: + { + for (int j = 0; 
j < QK_K/64; ++j) { + svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), m4b)); + svint8_t q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi1_1 = svmla_n_s32_x(svptrue_b32(), sumi1_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); + q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), m4b)); + q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi1_2 = svmla_n_s32_x(svptrue_b32(), sumi1_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); + + q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), 4)); + q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi2_1 = svmla_n_s32_x(svptrue_b32(), sumi2_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); + q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), 4)); + q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi2_2 = svmla_n_s32_x(svptrue_b32(), sumi2_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); + q4 += 32; + } + sumi1 = svadd_s32_x(svptrue_b32(), sumi1_1, sumi1_2); + sumi2 = svadd_s32_x(svptrue_b32(), sumi2_1, sumi2_2); + sumf += d * (svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sumi1, sumi2))); + } break; + case 256: + case 512: + { + for (int j = 0; j < QK_K/64; ++j) { + const svuint8_t q4bits = svld1_u8(svptrue_pat_b8(SV_VL32), q4); q4 += 32; + svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_pat_b8(SV_VL32), q4bits, m4b)); + svint8_t q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32; + sumi1 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); + + q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q4bits, 4)); + q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32; + sumi2 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); + } + sumf += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), 
svadd_s32_x(svptrue_pat_b32(SV_VL8), sumi1, sumi2))); + } break; + default: + assert(false && "Unsupported vector length"); + break; + } + } + *s = sumf; +#elif __ARM_NEON const uint8x16_t m4b = vdupq_n_u8(0xf); const int32x4_t mzero = vdupq_n_s32(0); From 681149ced2582f48c24792403df1307fed5eb951 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 16 Jan 2025 13:54:08 +0100 Subject: [PATCH 09/30] llama : add `llama_model_load_from_splits` (#11255) * llama : add `llama_model_load_from_splits` * update --- include/llama.h | 10 ++++++ src/llama-model-loader.cpp | 74 ++++++++++++++++++++++++++++++++------ src/llama-model-loader.h | 7 +++- src/llama-quant.cpp | 3 +- src/llama.cpp | 46 ++++++++++++++++++------ 5 files changed, 116 insertions(+), 24 deletions(-) diff --git a/include/llama.h b/include/llama.h index a184884c7..352c3417e 100644 --- a/include/llama.h +++ b/include/llama.h @@ -418,10 +418,20 @@ extern "C" { struct llama_model_params params), "use llama_model_load_from_file instead"); + // Load the model from a file + // If the file is split into multiple parts, the file name must follow this pattern: -%05d-of-%05d.gguf + // If the split file name does not follow this pattern, use llama_model_load_from_splits LLAMA_API struct llama_model * llama_model_load_from_file( const char * path_model, struct llama_model_params params); + // Load the model from multiple splits (support custom naming scheme) + // The paths must be in the correct order + LLAMA_API struct llama_model * llama_model_load_from_splits( + const char ** paths, + size_t n_paths, + struct llama_model_params params); + DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model), "use llama_model_free instead"); diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 53175f0e0..75073bf61 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -64,6 +64,33 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { } } +// return a list of 
splits for a given path +// for example, given "-00002-of-00004.gguf", returns list of all 4 splits +static std::vector llama_get_list_splits(const std::string & path, const int idx, const int n_split) { + std::vector paths; + std::string split_prefix; + std::vector buf(llama_path_max(), 0); + + { + int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split); + if (!ret) { + throw std::runtime_error(format("invalid split file name: %s", path.c_str())); + } + split_prefix = std::string(buf.data(), ret); + } + + if (split_prefix.empty()) { + throw std::runtime_error(format("invalid split file: %s", path.c_str())); + } + + for (int idx = 0; idx < n_split; ++idx) { + int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split); + paths.push_back(std::string(buf.data(), ret)); + } + + return paths; +} + namespace GGUFMeta { template struct GKV_Base_Type { @@ -413,7 +440,12 @@ namespace GGUFMeta { template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); -llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { +llama_model_loader::llama_model_loader( + const std::string & fname, + std::vector & splits, + bool use_mmap, + bool check_tensors, + const struct llama_model_kv_override * param_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -425,6 +457,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, } } + // Load the main GGUF struct ggml_context * ctx = NULL; struct gguf_init_params params = { /*.no_alloc = */ true, @@ -460,35 +493,54 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, // Load additional GGML contexts if (n_split > 1) { + 
// make sure the main file is loaded first uint16_t idx = 0; - get_key(llm_kv(LLM_KV_SPLIT_NO), idx); + const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO); + get_key(kv_split_no, idx); if (idx != 0) { - throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx)); + throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str())); } - std::vector split_prefix(llama_path_max(), 0); - if (!llama_split_prefix(split_prefix.data(), split_prefix.size(), fname.c_str(), idx, n_split)) { - throw std::runtime_error(format("invalid split file: %s", fname.c_str())); + // generate list of splits if needed + if (splits.empty()) { + splits = llama_get_list_splits(fname, idx, n_split); + } + + // in case user give a custom list of splits, check if it matches the expected number + if (n_split != (uint16_t)splits.size()) { + throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split)); } if (trace > 0) { LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); } - std::vector split_path(llama_path_max(), 0); + // load other splits for (idx = 1; idx < n_split; idx++) { - llama_split_path(split_path.data(), split_path.size(), split_prefix.data(), idx, n_split); + const char * fname_split = splits[idx].c_str(); struct gguf_init_params split_params = { /*.no_alloc = */ true, /*.ctx = */ &ctx, }; - gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path.data(), split_params) }; + gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; if (!ctx_gguf) { - throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path.data())); + throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split)); } - files.emplace_back(new llama_file(split_path.data(), "rb")); + // check idx + { + const int kid = 
gguf_find_key(ctx_gguf.get(), kv_split_no.c_str()); + if (kid < 0) { + throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split)); + } + int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid); + if (idx_gguf != idx) { + throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx)); + } + } + + files.emplace_back(new llama_file(fname_split, "rb")); contexts.emplace_back(ctx); // Save tensors data offset info of the shard. diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index b63d158d9..fe35404b2 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -90,7 +90,12 @@ struct llama_model_loader { size_t size_data = 0; std::vector> mmaps_used; - llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p); + llama_model_loader( + const std::string & fname, + std::vector & splits, // optional, only need if the split does not follow naming scheme + bool use_mmap, + bool check_tensors, + const struct llama_model_kv_override * param_overrides_p); template typename std::enable_if::value, bool>::type diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d4947a780..fb7982655 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -526,7 +526,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: kv_overrides = v->data(); } - llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides); + std::vector splits = {}; + llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 2e391b3b6..fede23d19 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -31,7 +31,7 @@ #endif // Returns 0 on success, -1 on error, and -2 on cancellation via 
llama_progress_callback -static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { +static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; @@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides); + llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); ml.print_info(); @@ -9374,14 +9374,9 @@ int64_t llama_time_us(void) { return ggml_time_us(); } -struct llama_model * llama_load_model_from_file( - const char * path_model, - struct llama_model_params params) { - return llama_model_load_from_file(path_model, params); -} - -struct llama_model * llama_model_load_from_file( - const char * path_model, +static struct llama_model * llama_model_load_from_file_impl( + const std::string & path_model, + std::vector & splits, struct llama_model_params params) { ggml_time_init(); @@ -9485,7 +9480,7 @@ struct llama_model * llama_model_load_from_file( LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024); } - const int status = llama_model_load(path_model, *model, params); + const int status = llama_model_load(path_model, splits, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -9501,6 +9496,35 @@ struct llama_model * llama_model_load_from_file( return model; } +// deprecated +struct llama_model * llama_load_model_from_file( + const char * path_model, + struct llama_model_params params) { + return llama_model_load_from_file(path_model, params); +} + +struct llama_model * 
llama_model_load_from_file( + const char * path_model, + struct llama_model_params params) { + std::vector splits = {}; + return llama_model_load_from_file_impl(path_model, splits, params); +} + +struct llama_model * llama_model_load_from_splits( + const char ** paths, + size_t n_paths, + struct llama_model_params params) { + std::vector splits; + if (n_paths == 0) { + LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__); + return nullptr; + } + for (size_t i = 0; i < n_paths; ++i) { + splits.push_back(paths[i]); + } + return llama_model_load_from_file_impl(splits.front(), splits, params); +} + struct llama_context * llama_init_from_model( struct llama_model * model, struct llama_context_params params) { From 9c8dcefe171ba70ed14f2a64d1433da12d0dd1c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 16 Jan 2025 16:43:38 +0100 Subject: [PATCH 10/30] CUDA: backwards pass for misc. ops, add tests (#11257) * CUDA: backwards pass for misc. ops, add tests * remove restrict from pointers --- ggml/include/ggml.h | 12 +- ggml/src/ggml-alloc.c | 5 + ggml/src/ggml-cpu/ggml-cpu.c | 133 ++++++----- ggml/src/ggml-cpu/ggml-cpu.cpp | 10 + ggml/src/ggml-cuda/cross-entropy-loss.cu | 175 ++++++++------- ggml/src/ggml-cuda/getrows.cu | 157 ++++++++----- ggml/src/ggml-cuda/getrows.cuh | 3 + ggml/src/ggml-cuda/ggml-cuda.cu | 27 ++- ggml/src/ggml-cuda/norm.cu | 155 ++++++++++--- ggml/src/ggml-cuda/norm.cuh | 2 + ggml/src/ggml-cuda/out-prod.cu | 38 +++- ggml/src/ggml-cuda/rope.cu | 48 ++-- ggml/src/ggml-cuda/softmax.cu | 134 ++++++++--- ggml/src/ggml-cuda/softmax.cuh | 2 + ggml/src/ggml-cuda/unary.cu | 36 +++ ggml/src/ggml-cuda/unary.cuh | 3 + ggml/src/ggml.c | 49 ++-- tests/test-backend-ops.cpp | 273 ++++++++++++++++++++--- 18 files changed, 930 insertions(+), 332 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index a9c051cd5..1198dc1fd 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1384,16 +1384,20 @@ extern "C" { 
float scale, float max_bias); - GGML_API struct ggml_tensor * ggml_soft_max_back( + GGML_API struct ggml_tensor * ggml_soft_max_ext_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + float scale, + float max_bias); // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( + GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + float scale, + float max_bias); // rotary position embedding // if (mode & 1) - skip n_past elements (NOT SUPPORTED) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 8dc8226ac..9a3bf9f29 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -37,6 +37,7 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml return true; } +// ops that return true for this function must not use restrict pointers for their backend implementations static bool ggml_op_can_inplace(enum ggml_op op) { switch (op) { case GGML_OP_SCALE: @@ -52,8 +53,12 @@ static bool ggml_op_can_inplace(enum ggml_op op) { case GGML_OP_LOG: case GGML_OP_UNARY: case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + case GGML_OP_SILU_BACK: case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: return true; default: diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7c2e45f86..dd9995562 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -6691,20 +6691,20 @@ static void ggml_compute_forward_silu_back_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * grad = dst->src[1]; + const struct ggml_tensor * grad = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; assert(ggml_is_contiguous_1(grad)); - 
assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(src1)); assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - assert(ggml_are_same_shape(src0, grad)); + assert(ggml_are_same_shape(src1, dst)); + assert(ggml_are_same_shape(src1, grad)); const int ith = params->ith; const int nth = params->nth; - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); + const int nc = src1->ne[0]; + const int nr = ggml_nrows(src1); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -6716,7 +6716,7 @@ static void ggml_compute_forward_silu_back_f32( for (int i1 = ir0; i1 < ir1; i1++) { ggml_vec_silu_backward_f32(nc, (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1])), + (float *) ((char *) src1->data + i1*(src1->nb[1])), (float *) ((char *) grad->data + i1*(grad->nb[1]))); #ifndef NDEBUG @@ -6895,7 +6895,7 @@ static void ggml_compute_forward_norm_f32( float eps; memcpy(&eps, dst->op_params, sizeof(float)); - GGML_ASSERT(eps > 0.0f); + GGML_ASSERT(eps >= 0.0f); // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -6966,7 +6966,7 @@ static void ggml_compute_forward_rms_norm_f32( float eps; memcpy(&eps, dst->op_params, sizeof(float)); - GGML_ASSERT(eps > 0.0f); + GGML_ASSERT(eps >= 0.0f); // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -7018,12 +7018,13 @@ static void ggml_compute_forward_rms_norm_back_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src0 = dst->src[0]; // gradients from forward pass output + const struct ggml_tensor * src1 = dst->src[1]; // src1 from forward pass GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src1->nb[0] == sizeof(float)); const int ith = params->ith; const int nth = params->nth; @@ 
-7042,8 +7043,8 @@ static void ggml_compute_forward_rms_norm_back_f32( const int64_t i12 = i02; const int64_t i13 = i03; - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); + const float * dz = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * x = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); ggml_float sum_xx = 0.0; ggml_float sum_xdz = 0.0; @@ -7066,9 +7067,9 @@ static void ggml_compute_forward_rms_norm_back_f32( { // z = rms_norm(x) // - // rms_norm(src0) = + // rms_norm(src1) = // scale( - // src0, + // src1, // div( // 1, // sqrt( @@ -7076,13 +7077,13 @@ static void ggml_compute_forward_rms_norm_back_f32( // scale( // sum( // sqr( - // src0)), + // src1)), // (1.0/N)), // eps)))); // postorder: // ## op args grad - // 00 param src0 grad[#00] + // 00 param src1 grad[#00] // 01 const 1 // 02 sqr (#00) grad[#02] // 03 sum (#02) grad[#03] @@ -7159,6 +7160,7 @@ static void ggml_compute_forward_rms_norm_back_f32( // dx := scale(dx, rrms) float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + // dx[i00] = (x*(-sum_xdz/sum_eps) + dz) / sqrtf(mean_eps) ggml_vec_cpy_f32 (ne00, dx, x); // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); @@ -7750,12 +7752,13 @@ static void ggml_compute_forward_out_prod_f32( const int ith = params->ith; const int nth = params->nth; - GGML_ASSERT(ne0 == ne00); - GGML_ASSERT(ne1 == ne10); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne3 == ne13); - GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + GGML_ASSERT(ne2 % ne02 == 0); + GGML_ASSERT(ne3 % ne03 == 0); // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == sizeof(float)); @@ -7797,6 +7800,10 @@ static void 
ggml_compute_forward_out_prod_f32( const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32); const int64_t blck_1 = 16; + // dps == dst per src0, used for group query attention + const int64_t dps2 = ne2 / ne02; + const int64_t dps3 = ne3 / ne03; + for (int64_t bir = ir0; bir < ir1; bir += blck_1) { const int64_t bir1 = MIN(bir + blck_1, ir1); for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) { @@ -7807,8 +7814,8 @@ static void ggml_compute_forward_out_prod_f32( const int64_t i2 = (ir - i3*ne2*ne1)/ne1; const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); - const int64_t i02 = i2; - const int64_t i03 = i3; + const int64_t i02 = i2 / dps2; + const int64_t i03 = i3 / dps3; //const int64_t i10 = i1; const int64_t i12 = i2; @@ -8906,9 +8913,9 @@ static void ggml_compute_forward_soft_max( } -// ggml_compute_forward_soft_max_back +// ggml_compute_forward_soft_max_ext_back -static void ggml_compute_forward_soft_max_back_f32( +static void ggml_compute_forward_soft_max_ext_back_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -8921,6 +8928,14 @@ static void ggml_compute_forward_soft_max_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src1, dst)); + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); + + GGML_ASSERT(max_bias == 0.0f); + // TODO: handle transposed/permuted matrices const int ith = params->ith; @@ -8969,10 +8984,11 @@ static void ggml_compute_forward_soft_max_back_f32( // linear runtime, no additional memory float dot_y_dy = 0; - ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); - ggml_vec_cpy_f32 (nc, dx, dy); - ggml_vec_acc1_f32(nc, dx, -dot_y_dy); - ggml_vec_mul_f32 (nc, dx, dx, y); + ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); + ggml_vec_cpy_f32 (nc, dx, dy); + ggml_vec_acc1_f32 (nc, dx, -dot_y_dy); + ggml_vec_mul_f32 (nc, dx, dx, y); + 
ggml_vec_scale_f32(nc, dx, scale); #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -8983,7 +8999,7 @@ static void ggml_compute_forward_soft_max_back_f32( } } -static void ggml_compute_forward_soft_max_back( +static void ggml_compute_forward_soft_max_ext_back( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -8992,7 +9008,7 @@ static void ggml_compute_forward_soft_max_back( switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_soft_max_back_f32(params, dst); + ggml_compute_forward_soft_max_ext_back_f32(params, dst); } break; default: { @@ -9985,9 +10001,10 @@ static void ggml_compute_forward_im2col_back_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output + const struct ggml_tensor * src1 = dst->src[1]; // convolution kernel + GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -10009,11 +10026,11 @@ static void ggml_compute_forward_im2col_back_f32( const int64_t IH = is_2D ? ne1 : 1; const int64_t IW = ne0; - const int64_t KH = is_2D ? ne01 : 1; - const int64_t KW = ne00; + const int64_t KH = is_2D ? ne11 : 1; + const int64_t KW = ne10; - const int64_t OH = is_2D ? ne12 : 1; - const int64_t OW = ne11; + const int64_t OH = is_2D ? ne02 : 1; + const int64_t OW = ne01; int ofs0 = is_2D ? nb3 : nb2; int ofs1 = is_2D ? 
nb2 : nb1; @@ -10059,9 +10076,9 @@ static void ggml_compute_forward_im2col_back_f32( continue; } - const float * const src_data = (const float *) src1->data + const float * const grad_in = (const float *) src0->data + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - grad += src_data[iic*(KH*KW) + ikh*KW + ikw]; + grad += grad_in[iic*(KH*KW) + ikh*KW + ikw]; } } float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW] @@ -12484,22 +12501,22 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - const struct ggml_tensor * opt0 = dst->src[2]; + const struct ggml_tensor * grad = dst->src[0]; // gradient of forward pass output + const struct ggml_tensor * src0f = dst->src[1]; // src0 of forward pass + const struct ggml_tensor * src1f = dst->src[2]; // src1 of forward pass GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - GGML_ASSERT(ggml_is_contiguous(opt0)); - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(src0f)); + GGML_ASSERT(ggml_is_contiguous(src1f)); + GGML_ASSERT(ggml_is_contiguous(grad)); + GGML_ASSERT(ggml_are_same_shape(src0f, src1f) && ggml_are_same_shape(src0f, dst)); const int64_t ith = params->ith; const int64_t nth = params->nth; // TODO: handle transposed/permuted matrices - const int64_t nc = src0->ne[0]; - const int64_t nr = ggml_nrows(src0); + const int64_t nc = src0f->ne[0]; + const int64_t nr = ggml_nrows(src0f); // rows per thread const int64_t dr = (nr + nth - 1)/nth; @@ -12508,12 +12525,12 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( const int64_t ir0 = dr*ith; const int64_t ir1 = MIN(ir0 + dr, nr); - const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr; + const float 
d_by_nr = ((const float *) grad->data)[0] / (float) nr; for (int64_t i1 = ir0; i1 < ir1; i1++) { - float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); - float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); - float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); + const float * s0 = (const float *)((const char *) src0f->data + i1*src0f->nb[1]); + const float * s1 = (const float *)((const char *) src1f->data + i1*src1f->nb[1]); #ifndef NDEBUG for (int64_t i = 0; i < nc; ++i) { @@ -12526,11 +12543,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( // soft_max float max = -INFINITY; ggml_vec_max_f32(nc, &max, s0); - ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max); + const ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max); assert(sum > 0.0); ggml_vec_scale_f32(nc, ds0, 1.0/sum); - // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr + // grad(src0f) = (softmax(src0f) - src1f) * grad(cross_entropy_loss(src0f, src1f)) / nr ggml_vec_sub_f32(nc, ds0, ds0, s1); ggml_vec_scale_f32(nc, ds0, d_by_nr); @@ -12827,7 +12844,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_SOFT_MAX_BACK: { - ggml_compute_forward_soft_max_back(params, tensor); + ggml_compute_forward_soft_max_ext_back(params, tensor); } break; case GGML_OP_ROPE: { diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 5c47ceb73..35a1c876c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -403,6 +403,16 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type; + case GGML_OP_SOFT_MAX_BACK: { + if (op->src[0]->type != GGML_TYPE_F32 || 
op->src[1]->type != GGML_TYPE_F32) { + return false; + } + float max_bias = 0.0f; + + memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float)); + + return max_bias == 0.0f; + } case GGML_OP_IM2COL_BACK: return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32; case GGML_OP_OUT_PROD: diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu index ed09406a8..27599a2b0 100644 --- a/ggml/src/ggml-cuda/cross-entropy-loss.cu +++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu @@ -5,95 +5,89 @@ #include #include -static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) { - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE; +template +static __global__ void cross_entropy_loss_f32( + const float * __restrict__ logits, const float * __restrict__ labels, float * __restrict__ dst, const int nclasses, const int k) { + extern __shared__ float tmp[]; - const int ne_tmp = WARP_SIZE*nclasses; - - extern __shared__ float tmp_all[]; - float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp; - float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp; - - // Each warp first loads ne_tmp logits/labels into shared memory: - for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) { - const int ig = i0*nclasses + i; // ig == i global - - tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f; - tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f; - } - - // Each thread in the warp then calculates the cross entropy loss for a single row. - // TODO: pad in order to avoid shared memory bank conflicts. 
+ logits += int64_t(blockIdx.x)*nclasses; + labels += int64_t(blockIdx.x)*nclasses; // Find maximum for softmax: - float max = -INFINITY; - for (int i = 0; i < nclasses; ++i) { - max = fmaxf(max, tmp_logits[lane_id*nclasses + i]); + float max_logit = -INFINITY; + for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { + const float val = logits[i]; + max_logit = fmaxf(max_logit, val); + + if (use_shared) { + tmp[i] = val; + } } + max_logit = warp_reduce_max(max_logit); // Calculate log(softmax(logits)) which is just logits - max: float sum = 0.0f; - for (int i = 0; i < nclasses; ++i) { - float val = tmp_logits[lane_id*nclasses + i] - max; - sum += expf(val); - tmp_logits[lane_id*nclasses + i] = val; + for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { + const float logit_i = use_shared ? tmp[i] : logits[i]; + sum += expf(logit_i - max_logit); } + sum = warp_reduce_sum(sum); sum = logf(sum); // log(exp(logits - max) / sum) = (logits - max) - log(sum) float loss = 0.0f; - for (int i = 0; i < nclasses; ++i) { - loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i]; + for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { + const float logit_i = use_shared ? tmp[i] : logits[i]; + loss += (logit_i - max_logit - sum) * labels[i]; } loss = -warp_reduce_sum(loss) / (float)k; - __syncthreads(); - - if (lane_id == 0) { - tmp_all[warp_id] = loss; - } - - __syncthreads(); - - if (warp_id != 0) { - return; - } - - loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? 
tmp_all[lane_id] : 0.0f; - loss = warp_reduce_sum(loss); - - if (lane_id != 0) { + if (threadIdx.x != 0) { return; } dst[blockIdx.x] = loss; } -static __global__ void cross_entropy_loss_back_f32(const float * logits, const float * labels, const float * loss, float * dst, const int nclasses) { +template +static __global__ void cross_entropy_loss_back_f32( + const float * __restrict__ grad, const float * __restrict__ logits, const float * __restrict__ labels, + float * __restrict__ dst, const int nclasses) { extern __shared__ float tmp[]; + logits += int64_t(blockIdx.x)*nclasses; + labels += int64_t(blockIdx.x)*nclasses; + dst += int64_t(blockIdx.x)*nclasses; + float maxval = -INFINITY; for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { - const float val = logits[blockIdx.x*nclasses + i]; + const float val = logits[i]; maxval = fmaxf(maxval, val); - tmp[i] = val; + + if (use_shared) { + tmp[i] = val; + } } maxval = warp_reduce_max(maxval); float sum = 0.0f; for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { - const float val = expf(tmp[i] - maxval); + const float val = expf((use_shared ? tmp[i] : logits[i]) - maxval); sum += val; - tmp[i] = val; + + if (use_shared) { + tmp[i] = val; + } else { + dst[i] = val; + } } sum = warp_reduce_sum(sum); const float sm_scale = 1.0f/sum; - const float d_by_nrows = *loss/gridDim.x; + const float d_by_nrows = *grad/gridDim.x; for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { - dst[blockIdx.x*nclasses + i] = (tmp[i]*sm_scale - labels[blockIdx.x*nclasses + i])*d_by_nrows; + const float val = use_shared ? 
tmp[i] : dst[i]; + dst[i] = (val*sm_scale - labels[i])*d_by_nrows; } } @@ -119,48 +113,77 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * ggml_cuda_pool & pool = ctx.pool(); cudaStream_t stream = ctx.stream(); - const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1); - const dim3 blocks_num((nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1); - const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*ne00*sizeof(float); + const dim3 blocks_dim(WARP_SIZE, 1, 1); + const dim3 blocks_num(nrows, 1, 1); + const size_t nbytes_shared = ne00*sizeof(float); + + const int id = ggml_cuda_get_device(); + const size_t smpbo = ggml_cuda_info().devices[id].smpbo; ggml_cuda_pool_alloc dst_tmp(pool, blocks_num.x); - cross_entropy_loss_f32<<>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows); + if (nbytes_shared <= smpbo) { +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shared_memory_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo)); + shared_memory_limit_raised[id] = true; + } +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) + cross_entropy_loss_f32<<>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows); + } else { + cross_entropy_loss_f32<<>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows); + } + CUDA_CHECK(cudaGetLastError()); // Combine results from individual blocks: sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream); } void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - const ggml_tensor * opt0 = dst->src[2]; + const ggml_tensor * grad = dst->src[0]; + const ggml_tensor * src0f = dst->src[1]; + const ggml_tensor * src1f = dst->src[2]; - GGML_ASSERT(src0->type == GGML_TYPE_F32); - 
GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(opt0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0f->type == GGML_TYPE_F32); + GGML_ASSERT(src1f->type == GGML_TYPE_F32); + GGML_ASSERT( grad->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_is_scalar(grad)); + GGML_ASSERT(ggml_is_contiguous(src0f)); + GGML_ASSERT(ggml_is_contiguous(src1f)); GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, src1)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src0f, src1f)); + GGML_ASSERT(ggml_are_same_shape(src0f, dst)); - const int64_t ne00 = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); + const int64_t ne00 = src0f->ne[0]; + const int64_t nrows = ggml_nrows(src0f); - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - const float * opt0_d = (const float *) opt0->data; - float * dst_d = (float *) dst->data; + const float * grad_d = (const float *) grad->data; + const float * src0f_d = (const float *) src0f->data; + const float * src1f_d = (const float *) src1f->data; + float * dst_d = (float *) dst->data; cudaStream_t stream = ctx.stream(); const dim3 blocks_dim(WARP_SIZE, 1, 1); const dim3 blocks_num(nrows, 1, 1); - const int shmem = ne00*sizeof(float); + const size_t nbytes_shared = ne00*sizeof(float); - cross_entropy_loss_back_f32<<>>(src0_d, src1_d, opt0_d, dst_d, ne00); + const int id = ggml_cuda_get_device(); + const size_t smpbo = ggml_cuda_info().devices[id].smpbo; + + if (nbytes_shared <= smpbo) { +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shared_memory_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32, 
cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo)); + shared_memory_limit_raised[id] = true; + } +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) + cross_entropy_loss_back_f32<<>>(grad_d, src0f_d, src1f_d, dst_d, ne00); + } else { + cross_entropy_loss_back_f32<<>>(grad_d, src0f_d, src1f_d, dst_d, ne00); + } } diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 4c3703238..4cef53a98 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -3,15 +3,15 @@ template static __global__ void k_get_rows( - const void * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12/*, size_t s13*/) { + const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst, + const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/ + /*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/ + /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3, + /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, + const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2; - const int i10 = blockDim.y*blockIdx.y + threadIdx.y; + const int i10 = blockDim.y*blockIdx.y + threadIdx.y; const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; @@ -22,10 +22,10 @@ static __global__ void k_get_rows( const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; + const void * src0_row = (const char *) src0 
+ i01*nb01 + i11*nb02 + i12*nb03; - const int ib = i00/qk; // block index - const int iqs = (i00%qk)/qr; // quant index + const int ib = i00/qk; // block index + const int iqs = (i00%qk)/qr; // quant index const int iybs = i00 - i00%qk; // dst block start index const int y_offset = qr == 1 ? 1 : qk/2; @@ -39,15 +39,15 @@ static __global__ void k_get_rows( template static __global__ void k_get_rows_float( - const src0_t * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12/*, size_t s13*/) { + const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst, + const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/ + /*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/ + /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3, + /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, + const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - const int i00 = blockIdx.x*blockDim.x + threadIdx.x; - const int i10 = blockDim.y*blockIdx.y + threadIdx.y; + const int i00 = blockIdx.x*blockDim.x + threadIdx.x; + const int i10 = blockDim.y*blockIdx.y + threadIdx.y; const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; @@ -58,14 +58,38 @@ static __global__ void k_get_rows_float( const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); + const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); dst_row[i00] = src0_row[i00]; } 
+template +static __global__ void k_get_rows_back_float( + const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) { + const int col = blockIdx.x*blockDim.x + threadIdx.x; + + if (col >= ncols) { + return; + } + + const int dst_row = blockIdx.y*blockDim.y + threadIdx.y; + + float sum = 0.0f; + + for (int64_t i = 0; i < nrows_grad; ++i) { + if (rows[i] != dst_row) { + continue; + } + sum += grad[i*ncols + col]; + } + + dst[dst_row*ncols + col] = sum; +} + template -static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { +static void get_rows_cuda( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { GGML_TENSOR_BINARY_OP_LOCALS @@ -87,22 +111,25 @@ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, gg GGML_ASSERT(ne00 % 2 == 0); k_get_rows<<>>( - src0_dd, src1_dd, dst_dd, - ne00, /*ne01, ne02, ne03,*/ - /*ne10, ne11,*/ ne12, /*ne13,*/ - /* s0,*/ s1, s2, s3, - /* nb00,*/ nb01, nb02, nb03, - s10, s11, s12/*, s13*/); + src0_dd, src1_dd, dst_dd, + ne00, /*ne01, ne02, ne03,*/ + /*ne10, ne11,*/ ne12, /*ne13,*/ + /* s0,*/ s1, s2, s3, + /* nb00,*/ nb01, nb02, nb03, + s10, s11, s12/*, s13*/); GGML_UNUSED(dst); } template -static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { +static void get_rows_cuda_float( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(ne13 == 1); + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int 
block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; const dim3 block_nums(block_num_x, ne10, ne11*ne12); @@ -119,12 +146,12 @@ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * sr //const size_t s13 = nb13 / ggml_element_size(src1); k_get_rows_float<<>>( - src0_dd, src1_dd, dst_dd, - ne00, /*ne01, ne02, ne03,*/ - /*ne10, ne11,*/ ne12, /*ne13,*/ - /* s0,*/ s1, s2, s3, - /* nb00,*/ nb01, nb02, nb03, - s10, s11, s12/*, s13*/); + src0_dd, src1_dd, dst_dd, + ne00, /*ne01, ne02, ne03,*/ + /*ne10, ne11,*/ ne12, /*ne13,*/ + /* s0,*/ s1, s2, s3, + /* nb00,*/ nb01, nb02, nb03, + s10, s11, s12/*, s13*/); GGML_UNUSED(dst); } @@ -132,42 +159,41 @@ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * sr void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - const float * src0_d = (const float *)src0->data; - const float * src1_d = (const float *)src1->data; - float * dst_d = (float *)dst->data; + + const void * src0_d = (const void *) src0->data; + const int32_t * src1_d = (const int32_t *) src1->data; + float * dst_d = (float *) dst->data; + cudaStream_t stream = ctx.stream(); - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); - GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - - const int32_t * src1_i32 = (const int32_t *) src1_d; + GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); switch (src0->type) { case GGML_TYPE_F16: - get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream); + get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream); break; case GGML_TYPE_F32: - get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + 
get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream); break; case GGML_TYPE_Q4_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); break; case GGML_TYPE_Q4_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); break; case GGML_TYPE_Q5_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); break; case GGML_TYPE_Q5_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); break; case GGML_TYPE_Q8_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); break; default: // TODO: k-quants @@ -175,3 +201,34 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { break; } } + +void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output + const ggml_tensor * src1 = dst->src[1]; // src1 in forward pass + + GGML_TENSOR_BINARY_OP_LOCALS + + const float * src0_d = (const float *) src0->data; + const int32_t * src1_d = (const int32_t *) src1->data; + float * dst_d = (float *) dst->data; + + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + GGML_ASSERT(ne02*ne03 == 1); + GGML_ASSERT(ne12*ne13 == 1); + GGML_ASSERT(ne2*ne3 == 1); + + const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1); + const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / 
CUDA_GET_ROWS_BACK_BLOCK_SIZE; + const dim3 block_nums(block_num_x, ne1, 1); + + k_get_rows_back_float<<>>(src0_d, src1_d, dst_d, ne00, ne10); +} diff --git a/ggml/src/ggml-cuda/getrows.cuh b/ggml/src/ggml-cuda/getrows.cuh index bbf130232..a1ca643f1 100644 --- a/ggml/src/ggml-cuda/getrows.cuh +++ b/ggml/src/ggml-cuda/getrows.cuh @@ -1,5 +1,8 @@ #include "common.cuh" #define CUDA_GET_ROWS_BLOCK_SIZE 256 +#define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256 void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 9118edc72..7fd1fc853 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2003,6 +2003,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_GET_ROWS: ggml_cuda_op_get_rows(ctx, dst); break; + case GGML_OP_GET_ROWS_BACK: + ggml_cuda_op_get_rows_back(ctx, dst); + break; case GGML_OP_DUP: ggml_cuda_dup(ctx, dst); break; @@ -2091,9 +2094,15 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_LEAKY_RELU: ggml_cuda_op_leaky_relu(ctx, dst); break; + case GGML_OP_SILU_BACK: + ggml_cuda_op_silu_back(ctx, dst); + break; case GGML_OP_RMS_NORM: ggml_cuda_op_rms_norm(ctx, dst); break; + case GGML_OP_RMS_NORM_BACK: + ggml_cuda_op_rms_norm_back(ctx, dst); + break; case GGML_OP_MUL_MAT: if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); @@ -2138,6 +2147,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SOFT_MAX: ggml_cuda_op_soft_max(ctx, dst); break; + case GGML_OP_SOFT_MAX_BACK: + ggml_cuda_op_soft_max_back(ctx, dst); + break; case GGML_OP_ROPE: 
ggml_cuda_op_rope(ctx, dst); break; @@ -2912,7 +2924,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } } break; case GGML_OP_OUT_PROD: - return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1; + return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { @@ -2928,6 +2940,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; } } break; + case GGML_OP_GET_ROWS_BACK: + { + return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1; + } break; case GGML_OP_CPY: { ggml_type src0_type = op->src[0]->type; @@ -3001,8 +3017,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } return false; } break; + case GGML_OP_SILU_BACK: + return ggml_is_contiguous(op->src[0]); + break; case GGML_OP_NORM: case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0; break; case GGML_OP_NONE: @@ -3027,6 +3047,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: return true; + case GGML_OP_SOFT_MAX_BACK: { + float max_bias = 0.0f; + memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float)); + return max_bias == 0.0f; + } case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { const size_t ts = ggml_type_size(op->src[0]->type); diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 133e219f0..d991ec972 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -5,20 +5,24 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; - float2 mean_var = 
make_float2(0.f, 0.f); + x += int64_t(row)*ncols; + dst += int64_t(row)*ncols; + + float2 mean_var = make_float2(0.0f, 0.0f); for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row*ncols + col]; + const float xi = x[col]; mean_var.x += xi; mean_var.y += xi * xi; } // sum up partial sums mean_var = warp_reduce_sum(mean_var); - if (block_size > WARP_SIZE) { + if constexpr (block_size > WARP_SIZE) { + static_assert(block_size == 1024, "unexpected block_size"); __shared__ float2 s_sum[32]; - int warp_id = threadIdx.x / WARP_SIZE; - int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; if (lane_id == 0) { s_sum[warp_id] = mean_var; } @@ -32,7 +36,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c const float inv_std = rsqrtf(var + eps); for (int col = tid; col < ncols; col += block_size) { - dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std; + dst[col] = (x[col] - mean) * inv_std; } } @@ -40,14 +44,8 @@ template static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) { // blockIdx.x: num_groups idx // threadIdx.x: block_size idx - int start = blockIdx.x * group_size; - int end = start + group_size; - - start += threadIdx.x; - - if (end >= ne_elements) { - end = ne_elements; - } + const int start = blockIdx.x*group_size + threadIdx.x; + const int end = min(blockIdx.x*group_size + group_size, ne_elements); float tmp = 0.0f; // partial sum for thread in warp @@ -56,10 +54,11 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr } tmp = warp_reduce_sum(tmp); - if (block_size > WARP_SIZE) { + if constexpr (block_size > WARP_SIZE) { + static_assert(block_size == 1024, "unexpected block_size"); __shared__ float s_sum[32]; - int warp_id = threadIdx.x / WARP_SIZE; - int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = 
threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; if (lane_id == 0) { s_sum[warp_id] = tmp; } @@ -68,11 +67,11 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr tmp = warp_reduce_sum(tmp); } - float mean = tmp / group_size; + const float mean = tmp / group_size; tmp = 0.0f; for (int j = start; j < end; j += block_size) { - float xi = x[j] - mean; + const float xi = x[j] - mean; dst[j] = xi; tmp += xi * xi; } @@ -80,8 +79,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr tmp = warp_reduce_sum(tmp); if (block_size > WARP_SIZE) { __shared__ float s_sum[32]; - int warp_id = threadIdx.x / WARP_SIZE; - int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; if (lane_id == 0) { s_sum[warp_id] = tmp; } @@ -90,8 +89,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr tmp = warp_reduce_sum(tmp); } - float variance = tmp / group_size; - float scale = rsqrtf(variance + eps); + const float variance = tmp / group_size; + const float scale = rsqrtf(variance + eps); for (int j = start; j < end; j += block_size) { dst[j] *= scale; } @@ -102,19 +101,23 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; + x += int64_t(row)*ncols; + dst += int64_t(row)*ncols; + float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row*ncols + col]; + const float xi = x[col]; tmp += xi * xi; } // sum up partial sums tmp = warp_reduce_sum(tmp); - if (block_size > WARP_SIZE) { + if constexpr (block_size > WARP_SIZE) { + static_assert(block_size == 1024, "unexpected block_size"); __shared__ float s_sum[32]; - int warp_id = threadIdx.x / WARP_SIZE; - int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x 
/ WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; if (lane_id == 0) { s_sum[warp_id] = tmp; } @@ -127,12 +130,63 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol const float scale = rsqrtf(mean + eps); for (int col = tid; col < ncols; col += block_size) { - dst[row*ncols + col] = scale * x[row*ncols + col]; + dst[col] = scale * x[col]; + } +} + +template +static __global__ void rms_norm_back_f32( + const float * grad, const float * xf, float * dst, const int ncols, const float eps) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + grad += int64_t(row)*ncols; + xf += int64_t(row)*ncols; + dst += int64_t(row)*ncols; + + float sum_xx = 0.0f; // sum for squares of x, equivalent to forward pass + float sum_xg = 0.0f; // sum for x * gradient, needed because RMS norm mixes inputs + + for (int col = tid; col < ncols; col += block_size) { + const float xfi = xf[col]; + sum_xx += xfi * xfi; + sum_xg += xfi * grad[col]; + } + + // sum up partial sums + sum_xx = warp_reduce_sum(sum_xx); + sum_xg = warp_reduce_sum(sum_xg); + if constexpr (block_size > WARP_SIZE) { + static_assert(block_size == 1024, "unexpected block_size"); + __shared__ float s_sum_xx[32]; + __shared__ float s_sum_xg[32]; + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum_xx[warp_id] = sum_xx; + s_sum_xg[warp_id] = sum_xg; + } + __syncthreads(); + + sum_xx = s_sum_xx[lane_id]; + sum_xx = warp_reduce_sum(sum_xx); + + sum_xg = s_sum_xg[lane_id]; + sum_xg = warp_reduce_sum(sum_xg); + } + + const float mean_eps = sum_xx / ncols + eps; + const float sum_eps = sum_xx + ncols*eps; + + const float scale_grad = rsqrtf(mean_eps); + const float scale_x = -scale_grad * sum_xg/sum_eps; + + for (int col = tid; col < ncols; col += block_size) { + dst[col] = scale_grad*grad[col] + scale_x*xf[col]; } } static void norm_f32_cuda(const float * x, float * dst, const int 
ncols, const int nrows, const float eps, cudaStream_t stream) { - GGML_ASSERT(ncols % WARP_SIZE == 0); if (ncols < 1024) { const dim3 block_dims(WARP_SIZE, 1, 1); norm_f32<<>>(x, dst, ncols, eps); @@ -142,7 +196,8 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i } } -static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) { +static void group_norm_f32_cuda( + const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) { if (group_size < 1024) { const dim3 block_dims(WARP_SIZE, 1, 1); group_norm_f32<<>>(x, dst, group_size, ne_elements, eps); @@ -153,7 +208,6 @@ static void group_norm_f32_cuda(const float * x, float * dst, const int num_grou } static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { - GGML_ASSERT(ncols % WARP_SIZE == 0); if (ncols < 1024) { const dim3 block_dims(WARP_SIZE, 1, 1); rms_norm_f32<<>>(x, dst, ncols, eps); @@ -163,6 +217,16 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con } } +static void rms_norm_back_f32_cuda(const float * grad, const float * xf, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { + if (ncols < 1024) { + const dim3 block_dims(WARP_SIZE, 1, 1); + rms_norm_back_f32<<>>(grad, xf, dst, ncols, eps); + } else { + const dim3 block_dims(1024, 1, 1); + rms_norm_back_f32<1024><<>>(grad, xf, dst, ncols, eps); + } +} + void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *)src0->data; @@ -179,6 +243,7 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { float eps; memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps >= 0.0f); 
norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream); } @@ -198,6 +263,7 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) float eps; memcpy(&eps, dst->op_params + 1, sizeof(float)); + GGML_ASSERT(eps >= 0.0f); int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream); @@ -219,6 +285,33 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { float eps; memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps >= 0.0f); rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream); } + +void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * grad = dst->src[0]; // gradients + const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass + + const float * grad_d = (const float *) grad->data; + const float * src0f_d = (const float *) src0f->data; + float * dst_d = (float *) dst->data; + + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous(grad)); + + GGML_ASSERT( grad->type == GGML_TYPE_F32); + GGML_ASSERT(src0f->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0f->ne[0]; + const int64_t nrows = ggml_nrows(src0f); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps >= 0.0f); + + rms_norm_back_f32_cuda(grad_d, src0f_d, dst_d, ne00, nrows, eps, stream); +} diff --git a/ggml/src/ggml-cuda/norm.cuh b/ggml/src/ggml-cuda/norm.cuh index 431a8f74d..d63d34380 100644 --- a/ggml/src/ggml-cuda/norm.cuh +++ b/ggml/src/ggml-cuda/norm.cuh @@ -5,3 +5,5 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void 
ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu index 619cfdcb5..73e3e2c47 100644 --- a/ggml/src/ggml-cuda/out-prod.cu +++ b/ggml/src/ggml-cuda/out-prod.cu @@ -11,16 +11,15 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ne01 == ne11); GGML_ASSERT(ne0 == ne00); GGML_ASSERT(ne1 == ne10); - GGML_ASSERT(ne2 == src0->ne[2]); + GGML_ASSERT(ne2 % src0->ne[2] == 0); + GGML_ASSERT(ne3 % src0->ne[3] == 0); + GGML_ASSERT(ne2 == src1->ne[2]); - GGML_ASSERT(ne3 == src0->ne[3]); GGML_ASSERT(ne3 == src1->ne[3]); const float * src0_d = (const float *) src0->data; @@ -33,8 +32,6 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const float alpha = 1.0f; const float beta = 0.0f; - GGML_ASSERT(ne2 == 1); - GGML_ASSERT(ne3 == 1); CUBLAS_CHECK(cublasSetStream(handle, stream)); const bool src1_T = ggml_is_transposed(src1); @@ -42,10 +39,27 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); GGML_ASSERT( (src1_T ? 
nb11 : nb10) == sizeof(float)); - CUBLAS_CHECK( - cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op, - ne0, ne1, ne01, - &alpha, src0_d, ne00, - src1_d, ldb, - &beta, dst_d, ne0)); + // data strides in dimensions 2/3 + const size_t s02 = nb02 / sizeof(float); + const size_t s03 = nb03 / sizeof(float); + const size_t s12 = nb12 / sizeof(float); + const size_t s13 = nb13 / sizeof(float); + const size_t s2 = nb2 / sizeof(float); + const size_t s3 = nb3 / sizeof(float); + + // dps == dst per src0, used for group query attention + const int64_t dps2 = ne2 / ne02; + const int64_t dps3 = ne3 / ne03; + + // TODO batched matrix multiplication + for (int64_t i3 = 0; i3 < ne3; ++i3) { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + CUBLAS_CHECK( + cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op, + ne0, ne1, ne01, + &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, ne00, + src1_d + i3 *s13 + i2 *s12, ldb, + &beta, dst_d + i3 *s3 + i2 *s2, ne0)); + } + } } diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index e1912fee1..18f691b2d 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -39,9 +39,9 @@ static __device__ void rope_yarn( template static __global__ void rope_norm( - const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, - const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) { + const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, + const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { @@ -83,9 +83,9 @@ static __global__ void rope_norm( template static __global__ void 
rope_neox( - const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, - const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) { + const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, + const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { @@ -127,9 +127,9 @@ static __global__ void rope_neox( template static __global__ void rope_multi( - const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, - const int n_dims, const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) { + const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, + const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { @@ -187,9 +187,9 @@ static __global__ void rope_multi( template static __global__ void rope_vision( - const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, - const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, - const float theta_scale, 
const float * __restrict__ freq_factors, const mrope_sections sections) { + const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, + const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, + const float theta_scale, const float * freq_factors, const mrope_sections sections) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { @@ -234,9 +234,9 @@ static __global__ void rope_vision( template static void rope_norm_cuda( - const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr, - const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) { + const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr, + const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { GGML_ASSERT(ne0 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); @@ -257,9 +257,9 @@ static void rope_norm_cuda( template static void rope_neox_cuda( - const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr, - const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) { + const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr, + const 
int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { GGML_ASSERT(ne0 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); @@ -280,9 +280,9 @@ static void rope_neox_cuda( template static void rope_multi_cuda( - const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr, - const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) { + const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr, + const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) { GGML_ASSERT(ne0 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); @@ -303,9 +303,9 @@ static void rope_multi_cuda( template static void rope_vision_cuda( - const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr, - const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) { + const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const 
int n_dims, const int nr, + const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) { GGML_ASSERT(ne0 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu index c24abae1f..9aa4b8489 100644 --- a/ggml/src/ggml-cuda/softmax.cu +++ b/ggml/src/ggml-cuda/softmax.cu @@ -1,5 +1,7 @@ #include "common.cuh" +#include "ggml.h" #include "softmax.cuh" +#include template static __device__ __forceinline__ float t2f32(T val) { @@ -11,14 +13,20 @@ __device__ float __forceinline__ t2f32(half val) { return __half2float(val); } -template -static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) { +template +static __global__ void soft_max_f32( + const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, + const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) { const int ncols = ncols_template == 0 ? ncols_par : ncols_template; const int tid = threadIdx.x; const int rowx = blockIdx.x; const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension + x += int64_t(rowx)*ncols; + mask += int64_t(rowy)*ncols * (mask != nullptr); + dst += int64_t(rowx)*ncols; + const int block_size = block_size_template == 0 ? 
blockDim.x : block_size_template; const int warp_id = threadIdx.x / WARP_SIZE; @@ -29,7 +37,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, float * dst extern __shared__ float data_soft_max_f32[]; float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication // shared memory buffer to cache values between iterations: - float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols; + float * vals = use_shared ? buf_iw + WARP_SIZE : dst; float max_val = -INFINITY; @@ -41,10 +49,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, float * dst break; } - const int64_t ix = (int64_t)rowx*ncols + col; - const int64_t iy = (int64_t)rowy*ncols + col; - - const float val = x[ix]*scale + (mask ? slope*t2f32(mask[iy]) : 0.0f); + const float val = x[col]*scale + (mask ? slope*t2f32(mask[col]) : 0.0f); vals[col] = val; max_val = max(max_val, val); @@ -110,8 +115,29 @@ static __global__ void soft_max_f32(const float * x, const T * mask, float * dst return; } - const int64_t idst = (int64_t)rowx*ncols + col; - dst[idst] = vals[col] * inv_sum; + dst[col] = vals[col] * inv_sum; + } +} + +static __global__ void soft_max_back_f32( + const float * grad, const float * dstf, float * dst, const int ncols, const float scale) { + const int tid = threadIdx.x; + const int rowx = blockIdx.x; + + grad += int64_t(rowx)*ncols; + dstf += int64_t(rowx)*ncols; + dst += int64_t(rowx)*ncols; + + float dgf_dot = 0.0f; // dot product of dst from forward pass and gradients + + for (int col = tid; col < ncols; col += WARP_SIZE) { + dgf_dot += dstf[col]*grad[col]; + } + + dgf_dot = warp_reduce_sum(dgf_dot); + + for (int col = tid; col < ncols; col += WARP_SIZE) { + dst[col] = scale * (grad[col] - dgf_dot) * dstf[col]; } } @@ -121,7 +147,7 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; const dim3 block_dims(nth, 1, 1); 
const dim3 block_nums(nrows_x, 1, 1); - const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float); + const size_t nbytes_shared = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float); static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); const uint32_t n_head = nrows_x/nrows_y; @@ -131,50 +157,68 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); // FIXME: this limit could be raised by ~2-4x on Ampere or newer - if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) { + if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) { switch (ncols_x) { case 32: - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + soft_max_f32<<>> + (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 64: - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + soft_max_f32<<>> + (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 128: - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + soft_max_f32<<>> + (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 256: - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + soft_max_f32<<>> + (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 512: - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + soft_max_f32<<>> + (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 1024: - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + soft_max_f32<<>> + (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 2048: - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, 
scale, max_bias, m0, m1, n_head_log2); + soft_max_f32<<>> + (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 4096: - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + soft_max_f32<<>> + (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; default: - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + soft_max_f32<<>> + (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; } } else { - const size_t shmem_low = WARP_SIZE*sizeof(float); - soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + const size_t nbytes_shared_low = WARP_SIZE*sizeof(float); + soft_max_f32<<>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); } } +static void soft_max_back_f32_cuda( + const float * grad, const float * dstf, float * dst, + const int ncols, const int nrows, const float scale, cudaStream_t stream) { + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + + soft_max_back_f32<<>>(grad, dstf, dst, ncols, scale); +} + void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - const float * src0_d = (const float *)src0->data; - const void * src1_d = src1 ? (const void *)src1->data : nullptr; + const float * src0_d = (const float *) src0->data; + const void * src1_d = src1 ? 
(const void *) src1->data : nullptr; + float * dst_d = (float *) dst->data; - float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); @@ -189,18 +233,42 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { float scale = 1.0f; float max_bias = 0.0f; - memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); - memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); + memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); if (use_f16) { - const half * src1_dd = (const half *)src1_d; - - soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); + soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); } else { - const float * src1_dd = (const float *)src1_d; - - soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); + soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); } } + +void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; // grad + const ggml_tensor * src1 = dst->src[1]; // forward pass output + + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); 
+ + GGML_ASSERT(max_bias == 0.0f); + + soft_max_back_f32_cuda(src0_d, src1_d, dst_d, ncols, nrows, scale, stream); +} diff --git a/ggml/src/ggml-cuda/softmax.cuh b/ggml/src/ggml-cuda/softmax.cuh index 4ef4ff86c..93dfee835 100644 --- a/ggml/src/ggml-cuda/softmax.cuh +++ b/ggml/src/ggml-cuda/softmax.cuh @@ -3,3 +3,5 @@ #define CUDA_SOFT_MAX_BLOCK_SIZE 1024 void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 81fc92202..6b21f407d 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -51,6 +51,19 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) { dst[i] = x[i] / (1.0f + expf(-x[i])); } +static __global__ void silu_back_f32( + const float * grad, const float * xf, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + const float xfi = xf[i]; + const float s = 1.0f / (1.0f + expf(-xfi)); + dst[i] = grad[i] * s * (1.0f + xfi * (1.0f - s)); +} + static __global__ void tanh_f32(const float * x, float * dst, int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { @@ -173,6 +186,11 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_ silu_f32<<>>(x, dst, k); } +static void silu_back_f32_cuda(const float * grad, const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE; + silu_back_f32<<>>(grad, x, dst, k); +} + static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE; tanh_f32<<>>(x, dst, k); @@ -284,6 +302,24 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { silu_f32_cuda(src0_d, dst_d, 
ggml_nelements(src0), stream); } +void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; // input from forward pass + const ggml_tensor * src1 = dst->src[1]; // grads of forward pass output + + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous(src0)); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + silu_back_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(src0), stream); +} + void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *)src0->data; diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index c91936728..e7f62643a 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -4,6 +4,7 @@ #define CUDA_STEP_BLOCK_SIZE 256 #define CUDA_GELU_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256 +#define CUDA_SILU_BACK_BLOCK_SIZE 256 #define CUDA_TANH_BLOCK_SIZE 256 #define CUDA_RELU_BLOCK_SIZE 256 #define CUDA_SIGMOID_BLOCK_SIZE 256 @@ -23,6 +24,8 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ecfb84a80..b1d0d4913 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3450,12 +3450,14 @@ struct ggml_tensor * ggml_soft_max_ext( return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false); } -// ggml_soft_max_back +// ggml_soft_max_ext_back -static struct 
ggml_tensor * ggml_soft_max_back_impl( +static struct ggml_tensor * ggml_soft_max_ext_back_impl( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, + float scale, + float max_bias, bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -3463,21 +3465,28 @@ static struct ggml_tensor * ggml_soft_max_back_impl( result->src[0] = a; result->src[1] = b; + memcpy((float *) result->op_params + 0, &scale, sizeof(float)); + memcpy((float *) result->op_params + 1, &max_bias, sizeof(float)); + return result; } -struct ggml_tensor * ggml_soft_max_back( +struct ggml_tensor * ggml_soft_max_ext_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_soft_max_back_impl(ctx, a, b, false); + struct ggml_tensor * b, + float scale, + float max_bias) { + return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, false); } -struct ggml_tensor * ggml_soft_max_back_inplace( +struct ggml_tensor * ggml_soft_max_ext_back_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_soft_max_back_impl(ctx, a, b, true); + struct ggml_tensor * b, + float scale, + float max_bias) { + return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, true); } // ggml_rope @@ -5076,10 +5085,10 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c) { - GGML_ASSERT(ggml_are_same_shape(a, b)); - GGML_ASSERT(ggml_is_scalar(c)); + GGML_ASSERT(ggml_is_scalar(a)); + GGML_ASSERT(ggml_are_same_shape(b, c)); - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_dup_tensor(ctx, b); result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; result->src[0] = a; @@ -5258,7 +5267,7 @@ static void ggml_sub_or_set( } static void ggml_compute_backward( - struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, bool * grads_needed) { + struct ggml_context 
* ctx, struct ggml_cgraph * cgraph, int i, const bool * grads_needed) { struct ggml_tensor * tensor = cgraph->nodes[i]; struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, tensor); @@ -5402,7 +5411,7 @@ static void ggml_compute_backward( if (src0_needs_grads) { float eps; memcpy(&eps, tensor->op_params, sizeof(float)); - ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, src0, grad, eps)); + ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, grad, src0, eps)); } } break; case GGML_OP_MUL_MAT: { @@ -5585,7 +5594,13 @@ static void ggml_compute_backward( } break; case GGML_OP_SOFT_MAX: { if (src0_needs_grads) { - ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_back(ctx, grad, tensor)); + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (const float *) tensor->op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) tensor->op_params + 1, sizeof(float)); + + ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_ext_back(ctx, grad, tensor, scale, max_bias)); } GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented"); } break; @@ -5626,7 +5641,7 @@ static void ggml_compute_backward( const int32_t d1 = ggml_get_op_params_i32(tensor, 5); const bool is_2D = ggml_get_op_params_i32(tensor, 6) == 1; - ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, src0, grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D)); + ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, grad, src0, src1->ne, s0, s1, p0, p1, d0, d1, is_2D)); } } break; case GGML_OP_POOL_2D: { @@ -5669,7 +5684,7 @@ static void ggml_compute_backward( } break; case GGML_UNARY_OP_SILU: { if (src0_needs_grads) { - ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, src0, grad)); + ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, grad, src0)); } } break; case GGML_UNARY_OP_EXP: { @@ -5686,7 +5701,7 @@ static void ggml_compute_backward( } break; case GGML_OP_CROSS_ENTROPY_LOSS: { if (src0_needs_grads) { - ggml_add_or_set(ctx, 
cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, src0, src1, grad)); + ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, grad, src0, src1)); } GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented"); } break; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 4c8464d8b..0ed3e98e1 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -780,7 +780,7 @@ struct test_case { } } if (!any_params) { - printf("not supported [%s] \n", op_name); + printf("not supported [%s] \n", op_desc(out).c_str()); supported = false; } if (!supported) { @@ -1130,6 +1130,59 @@ struct test_get_rows : public test_case { } }; +// GGML_OP_GET_ROWS_BACK +struct test_get_rows_back : public test_case { + const ggml_type type; + const int n; // cols + const int m; // rows + const int r; // rows to get + const int b; // batch size + const bool v; // view (non-contiguous src1) + + std::string vars() override { + return VARS_TO_STR6(type, n, m, r, b, v); + } + + test_get_rows_back(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false) + : type(type), n(n), m(m), r(r), b(b), v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * in_forward = ggml_new_tensor_3d(ctx, type, n, m, b); + ggml_set_name(in_forward, "in_forward"); + + ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b); + ggml_set_name(rows, "rows"); + if (v) { + rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0); + ggml_set_name(rows, "view_of_rows"); + } + + ggml_tensor * grad = ggml_new_tensor_3d(ctx, type, n, r, b); + ggml_set_name(grad, "grad"); + + ggml_tensor * out = ggml_get_rows_back(ctx, grad, rows, in_forward); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + if 
(ggml_is_view_op(t->op)) { continue; } + // rows + std::vector data(r*b); + for (int i = 0; i < r*b; i++) { + data[i] = rand() % m; + } + ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int)); + } else { + init_tensor_uniform(t); + } + } + } +}; + // GGML_OP_ARGMAX struct test_argmax : public test_case { const ggml_type type; @@ -1531,6 +1584,39 @@ struct test_scale : public test_case { } }; +// GGML_OP_SILU_BACK +struct test_silu_back : public test_case { + const ggml_type type; + const std::array ne; + float eps; + + std::string vars() override { + return VARS_TO_STR3(type, ne, eps); + } + + test_silu_back(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 5, 4, 3}, + float eps = 1e-6f) + : type(type), ne(ne), eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * grad = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(grad, "grad"); + + ggml_tensor * out = ggml_silu_back(ctx, a, grad); + ggml_set_name(out, "out"); + + return out; + } + + bool grad_precise() override { + return true; + } +}; + // GGML_OP_NORM struct test_norm : public test_case { const ggml_type type; @@ -1583,11 +1669,56 @@ struct test_rms_norm : public test_case { return out; } + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.f, 10.f); + } + } + + float grad_eps() override { + return 1.0f; + } + bool grad_precise() override { return true; } }; +// GGML_OP_RMS_NORM_BACK +struct test_rms_norm_back : public test_case { + const ggml_type type; + const std::array ne; + float eps; + + std::string vars() override { + return VARS_TO_STR3(type, ne, eps); + } + + test_rms_norm_back(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 5, 4, 3}, + float eps = 1e-6f) + : type(type), ne(ne), eps(eps) {} + + ggml_tensor * 
build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_rms_norm_back(ctx, a, b, eps); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.f, 10.f); + } + } +}; + // GGML_OP_SSM_CONV struct test_ssm_conv : public test_case { const ggml_type type; @@ -1855,10 +1986,11 @@ struct test_out_prod : public test_case { const int64_t n; const int64_t k; const std::array bs; // dims 3 and 4 + const std::array nr; // repeat in dims 3 and 4 const bool trans_b; std::string vars() override { - return VARS_TO_STR7(type_a, type_b, m, n, k, bs, trans_b); + return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, trans_b); } double max_nmse_err() override { @@ -1868,8 +2000,9 @@ struct test_out_prod : public test_case { test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, int64_t m = 32, int64_t n = 32, int64_t k = 32, std::array bs = {10, 10}, + std::array nr = {2, 2}, bool trans_b = false) - : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), trans_b(trans_b) {} + : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), trans_b(trans_b) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]); @@ -1877,10 +2010,10 @@ struct test_out_prod : public test_case { ggml_tensor * b; if (trans_b) { - b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0], bs[1]); + b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); b = ggml_transpose(ctx, b); } else { - b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0], bs[1]); + b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0]*nr[0], bs[1]*nr[1]); } 
ggml_set_name(b, "b"); @@ -2191,6 +2324,36 @@ struct test_soft_max : public test_case { } }; +// GGML_OP_SOFT_MAX_BACK +struct test_soft_max_back : public test_case { + const ggml_type type; + const std::array ne; + const float scale; + const float max_bias; + + std::string vars() override { + return VARS_TO_STR4(type, ne, scale, max_bias); + } + + test_soft_max_back(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}, + float scale = 1.0f, + float max_bias = 0.0f) + : type(type), ne(ne), scale(scale), max_bias(max_bias) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_soft_max_ext_back(ctx, a, b, scale, max_bias); + ggml_set_name(out, "out"); + + return out; + } +}; // GGML_OP_ROPE + GGML_OP_ROPE_BACK struct test_rope : public test_case { @@ -2980,6 +3143,40 @@ struct test_cross_entropy_loss : public test_case { } }; +// GGML_OP_CROSS_ENTROPY_LOSS_BACK +struct test_cross_entropy_loss_back : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_cross_entropy_loss_back(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * grad = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + ggml_set_name(grad, "grad"); + + ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(logits, "logits"); + + ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(labels, "labels"); + + // Ensure labels add up to 1: + labels = ggml_soft_max(ctx, labels); + ggml_set_name(labels, "labels_normalized"); + + ggml_tensor * out = ggml_cross_entropy_loss_back(ctx, grad, logits, labels); + ggml_set_name(out, "out"); + + 
return out; + } +}; + // GGML_OP_OPT_STEP_ADAMW struct test_opt_step_adamw : public test_case { const ggml_type type; @@ -3479,6 +3676,16 @@ static std::vector> make_test_cases_eval() { } } + test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 8, 2, 1, false)); + for (ggml_type type : all_types) { + for (bool v : {false, true}) { + test_cases.emplace_back(new test_get_rows_back(type, 256, 5, 4, 1, v)); + } + } + for (bool v : {false, true}) { + test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_I32, 256, 5, 4, 1, v)); + } + for (ggml_type type_input : {GGML_TYPE_F32}) { for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) { for (int k0 : {1, 3}) { @@ -3657,10 +3864,12 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_add1()); test_cases.emplace_back(new test_scale()); + test_cases.emplace_back(new test_silu_back()); - for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) { - test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); - test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); + for (float eps : {0.0f, 1e-7f, 1e-4f, 1e-1f}) { + test_cases.emplace_back(new test_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps)); + test_cases.emplace_back(new test_rms_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps)); + test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); } test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1})); @@ -3800,22 +4009,19 @@ static std::vector> make_test_cases_eval() { for (ggml_type type_a : base_types) { for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, { 1, 1})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10})); - 
test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10})); - - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}, true)); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10})); - test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10})); + for (int n : {1, 16}) { + for (int k : {1, 16}) { + for (int bs2 : {1, 3}) { + for (int bs3 : {1, 3}) { + for (int nr2 : {1, 2}) { + for (int nr3 : {1, 2}) { + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, n, k, {bs2, bs3}, {nr2, nr3})); + } + } + } + } + } + } } } @@ -3858,11 +4064,22 @@ static std::vector> make_test_cases_eval() { } } } - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f)); + for (float max_bias : {0.0f, 8.0f}) { + for (float scale : {1.0f, 0.1f}) { + for (int64_t ne0 : {16, 1024}) { + for (int64_t ne1 : {16, 1024}) { + test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0, ne1, 1, 1}, scale, 
max_bias)); + test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, scale, max_bias)); + } + } + } + } + for (bool fw : {true, false}) { // fw == forward bool all = true; @@ -3953,7 +4170,11 @@ static std::vector> make_test_cases_eval() { } } - test_cases.emplace_back(new test_cross_entropy_loss()); + test_cases.emplace_back(new test_cross_entropy_loss (GGML_TYPE_F32, { 10, 5, 4, 3})); + test_cases.emplace_back(new test_cross_entropy_loss (GGML_TYPE_F32, {30000, 1, 1, 1})); + test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, { 10, 5, 4, 3})); + test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, {30000, 1, 1, 1})); + test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3})); // these tests are disabled to save execution time, but they can be handy for debugging From 4dbc8b9cb71876e005724f4e8f73a3544646bcf5 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Fri, 17 Jan 2025 02:10:38 +0800 Subject: [PATCH 11/30] llama : add internlm3 support (#11233) * support internlm3 * fix lint --- convert_hf_to_gguf.py | 60 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4dc9837ab..95f112043 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2882,6 +2882,66 @@ class InternLM2Model(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("InternLM3ForCausalLM") +class InternLM3Model(Model): + model_arch = gguf.MODEL_ARCH.LLAMA + + def set_vocab(self): + tokens, scores, toktypes = self._create_vocab_sentencepiece() + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + + tokenizer_config_file = self.dir_model / 
'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + if "added_tokens_decoder" in tokenizer_config_json: + for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items(): + if token_data.get("special"): + token_id = int(token_id) + token = token_data["content"] + special_vocab._set_special_token(token, token_id) + # update eos token + if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids: + special_vocab.special_token_ids["eos"] = token_id + + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + 
@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT From 206bc53422521a67de5e2caba661e21d590d2bae Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 16 Jan 2025 15:16:39 -0600 Subject: [PATCH 12/30] vulkan: optimize coopmat2 q2_k dequant function (#11130) --- .../vulkan-shaders/dequant_funcs_cm2.comp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp index 94b78598e..e768b8930 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp @@ -101,19 +101,25 @@ layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_ block_q2_K block; }; +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2_K_packed16 { + block_q2_K_packed16 block; +}; + float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { + decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl); const f16vec2 d = bl.block.d; const uint idx = coordInBlock[1]; - const uint iqs = idx; - const uint qsi = (iqs / 128) * 32 + (iqs % 32); // 0..31 - const uint scalesi = iqs / 16; // 0..15 - const uint qsshift = ((iqs % 128) / 32) * 2; // 0,2,4,6 + const uint scalesi = (idx & 0xF0) >> 4; // 0..15 + const uint qsshift = (idx & 0x60) >> 4; // 0,2,4,6 + + uint qs = uint32_t(bl16.block.qs[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]); + qs = (qs >> qsshift) & 0x0303; + qs = unpack8(qs)[idx & 1]; - uint32_t qs = bl.block.qs[qsi]; const uint scales = bl.block.scales[scalesi]; - float16_t ret = d.x * float16_t(scales & 0xF) * float16_t((qs >> qsshift) & 3) - d.y * float16_t(scales >> 4); + float16_t ret = d.x * float16_t(scales & 0xF) * float16_t(qs) - d.y * float16_t(scales >> 4); return ret; } From 
466300fe1416de2802b710215817db28d4496f41 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 16 Jan 2025 15:23:49 -0600 Subject: [PATCH 13/30] vulkan: optimize coopmat2 q4_k/q5_k dequant functions. (#11206) Do masking on whole dwords, fetch all scales at once. --- .../vulkan-shaders/dequant_funcs_cm2.comp | 80 +++++++++++-------- .../src/ggml-vulkan/vulkan-shaders/types.comp | 10 +++ 2 files changed, 57 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp index e768b8930..175e31fa7 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp @@ -163,39 +163,47 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4 block_q4_K_packed16 block; }; +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed128 { + block_q4_K_packed128 block; +}; + float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { decodeBufQ4_K_packed16 bl16 = decodeBufQ4_K_packed16(bl); + decodeBufQ4_K_packed128 bl128 = decodeBufQ4_K_packed128(bl); const uint idx = coordInBlock[1]; const uint b = (idx & 0x20) >> 5; // 0,1 const uint is = (idx & 0xE0) >> 5; // 0..7 - const f16vec2 loadd = bl.block.d; + uvec4 v = bl128.block.q4k[0]; + + const f16vec2 loadd = unpackFloat2x16(v.x); uint32_t sc; uint32_t mbyte; - uint32_t scidx0 = (is < 4) ? is : (is + 4); - uint32_t scidx1 = (is < 4) ? is : (is - 4); - uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0; - uint32_t scidxshift1 = (is < 4) ? 0 : 2; - uint32_t mbidx0 = is + 4; - uint32_t mbidx1 = (is < 4) ? is + 4 : is; - uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0; - uint32_t mbidxshift0 = (is < 4) ? 0 : 4; - uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0; - uint32_t mbidxshift1 = (is < 4) ? 
0 : 2; + uint32_t scale0 = v.y; + uint32_t scale4 = v.z; + uint32_t scale8 = v.w; - sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1)); - mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + uint32_t sc_lo = scale0; + uint32_t mb_lo = scale4; + uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2); + uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2); + + sc = is < 4 ? sc_lo : sc_hi; + mbyte = is < 4 ? mb_lo : mb_hi; + sc = sc >> (8 * (is & 3)); + mbyte = mbyte >> (8 * (is & 3)); + sc &= 0x3F; + mbyte &= 0x3F; const float16_t d = loadd.x * float16_t(sc); const float16_t m = loadd.y * float16_t(mbyte); uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]); - qs = (qs >> (b * 4)) & 0x0F0F; - qs = unpack8(qs)[idx & 1]; + qs = (qs >> (b * 4 + 8 * (idx & 1))) & 0xF; float16_t ret = d * float16_t(qs) - m; @@ -210,47 +218,53 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5 block_q5_K_packed16 block; }; +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed128 { + block_q5_K_packed128 block; +}; + float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl); + decodeBufQ5_K_packed128 bl128 = decodeBufQ5_K_packed128(bl); const uint idx = coordInBlock[1]; const uint b = (idx & 0x20) >> 5; // 0,1 const uint is = (idx & 0xE0) >> 5; // 0..7 - const uint32_t hm = 0x0101 << is; + uvec4 v = bl128.block.q5k[0]; - const f16vec2 loadd = bl.block.d; + const f16vec2 loadd = unpackFloat2x16(v.x); uint32_t sc; uint32_t mbyte; - uint32_t scidx0 = (is < 4) ? is : (is + 4); - uint32_t scidx1 = (is < 4) ? is : (is - 4); - uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0; - uint32_t scidxshift1 = (is < 4) ? 
0 : 2; - uint32_t mbidx0 = is + 4; - uint32_t mbidx1 = (is < 4) ? is + 4 : is; - uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0; - uint32_t mbidxshift0 = (is < 4) ? 0 : 4; - uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0; - uint32_t mbidxshift1 = (is < 4) ? 0 : 2; + uint32_t scale0 = v.y; + uint32_t scale4 = v.z; + uint32_t scale8 = v.w; - sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1)); - mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + uint32_t sc_lo = scale0; + uint32_t mb_lo = scale4; + uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2); + uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2); + + sc = is < 4 ? sc_lo : sc_hi; + mbyte = is < 4 ? mb_lo : mb_hi; + sc = sc >> (8 * (is & 3)); + mbyte = mbyte >> (8 * (is & 3)); + sc &= 0x3F; + mbyte &= 0x3F; const float16_t d = loadd.x * float16_t(sc); const float16_t m = loadd.y * float16_t(mbyte); uint qh = uint32_t(bl16.block.qh[(idx & 0x1E) >> 1]); - qh = qh & hm; - qh = unpack8(qh)[idx & 1]; + qh = ((qh >> is) & 0x101) << 4; uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]); qs = (qs >> (b * 4)) & 0x0F0F; - qs = unpack8(qs)[idx & 1]; + qs = unpack8(qs | qh)[idx & 1]; - float16_t ret = d * (float16_t(qs) + (qh != 0 ? 
float16_t(16) : float16_t(0))) - m; + float16_t ret = d * (float16_t(qs)) - m; return ret; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index f12e61bbe..1e35b6652 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -227,6 +227,11 @@ struct block_q4_K_packed32 uint32_t qs[QUANT_K_Q4_K/2/4]; }; +struct block_q4_K_packed128 +{ + uvec4 q4k[9]; +}; + #if defined(DATA_A_Q4_K) #define QUANT_K QUANT_K_Q4_K #define A_TYPE block_q4_K @@ -252,6 +257,11 @@ struct block_q5_K_packed16 uint16_t qs[QUANT_K_Q5_K/2/2]; }; +struct block_q5_K_packed128 +{ + uvec4 q5k[11]; +}; + #if defined(DATA_A_Q5_K) #define QUANT_K QUANT_K_Q5_K #define A_TYPE block_q5_K From bd38ddea0181bc717de7cae66fd4323975c85656 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 16 Jan 2025 15:47:10 -0600 Subject: [PATCH 14/30] vulkan: support copy from f32 to q4_0/q4_1/q5_0/q5_1/q8_0/iq4_nl (#11166) * vulkan: support copy from f32 to q4_0/q4_1/q5_0/q5_1/q8_0/iq4_nl Shaders are based on cpy.cu. 
* vulkan: support copy from q4_0/q4_1/q5_0/q5_1/q8_0/iq4_nl to f32 * ggml: copy q->f32 assumes some contiguity in the destination --- ggml/src/ggml-cpu/ggml-cpu.c | 55 ++++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 77 +++++- .../vulkan-shaders/copy_from_quant.comp | 51 ++++ .../vulkan-shaders/copy_to_quant.comp | 237 ++++++++++++++++++ .../vulkan-shaders/generic_unary_head.comp | 20 ++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 5 + tests/test-backend-ops.cpp | 6 + 7 files changed, 446 insertions(+), 5 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index dd9995562..0ed92b3ff 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3967,6 +3967,57 @@ static void ggml_compute_forward_dup_bytes( } } +static void ggml_compute_forward_dup_q( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; + + size_t qk = ggml_blck_size(type); + const int64_t nr = ggml_nelements(src1) / qk; + + // destination must be contiguous in the first dimension + GGML_ASSERT(nb10 == ggml_type_size(dst->type)); + // must either have first dimension large enough to hold a row, or fully contiguous + GGML_ASSERT((ne10 % qk) == 0 || ggml_is_contiguous(dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t ir = ir0; ir < ir1; ++ir) { + + uint32_t i = ir * qk; + + const int64_t i03 = i/(ne00 * ne01 * ne02); + const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ 
(ne00*ne01); + const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; + + const int64_t i13 = i/(ne10 * ne11 * ne12); + const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13; + + dequantize_row_q( + (const void *) ((char *) src0->data + x_offset), + (float *) ((char *) dst->data + dst_offset), qk); + } +} + static void ggml_compute_forward_dup( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -3993,6 +4044,10 @@ static void ggml_compute_forward_dup( } break; default: { + if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) { + ggml_compute_forward_dup_q(params, dst); + break; + } GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 649146d7b..8e3e91495 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -228,6 +228,8 @@ struct vk_device_struct { vk_pipeline pipeline_repeat_f32; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16; + vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT]; + vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT]; vk_pipeline pipeline_norm_f32; vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; @@ -1965,6 +1967,20 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 
1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], 
"cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_0], "cpy_q5_0_f32", cpy_q5_0_f32_len, cpy_q5_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1); ggml_vk_create_pipeline(device, device->pipeline_add_f32_norepeat, "add_f32_norepeat", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1); ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1); @@ -3689,6 +3705,33 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_cpy_f16_f16; } } + if (src->type == GGML_TYPE_F32) { + switch (to) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + 
case GGML_TYPE_IQ4_NL: + return ctx->device->pipeline_cpy_f32_quant[to]; + default: + break; + } + } + + if (to == GGML_TYPE_F32) { + switch (src->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_IQ4_NL: + return ctx->device->pipeline_cpy_quant_f32[src->type]; + default: + break; + } + } std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl; GGML_ABORT("fatal error"); @@ -5160,7 +5203,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")"); - GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT + GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT GGML_ASSERT(dst->buffer != nullptr); const uint64_t ne00 = src0->ne[0]; @@ -7905,12 +7948,36 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm { ggml_type src0_type = op->src[0]->type; ggml_type src1_type = op->src[1] != nullptr ? 
op->src[1]->type : src0_type; - if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { - return true; + + if (src0_type == GGML_TYPE_F32) { + switch (src1_type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_IQ4_NL: + return true; + default: + break; + } } - if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { - return true; + if (src1_type == GGML_TYPE_F32) { + switch (src0_type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_IQ4_NL: + return true; + default: + break; + } } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { return true; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp new file mode 100644 index 000000000..c09bf496b --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp @@ -0,0 +1,51 @@ +#version 450 + +#include "types.comp" +#include "generic_unary_head.comp" +#include "dequant_funcs.comp" + +#if defined(DATA_A_IQ4_NL) +// 16 invocations needed for init_iq4nl_shmem +layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in; +#else +layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; +#endif + +void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); + if (gl_LocalInvocationIndex.x != 0) { + return; + } +#endif + + const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K; + + if (idx >= p.ne) { + return; + } + + uint dst_idx = get_doffset() + dst_idx(idx); + uint src_idx = src0_idx_quant(idx, QUANT_K); + + const uint a_offset = 0; + const uint ib = src_idx; + const vec2 dm = get_dm(ib, a_offset); + + [[unroll]] for (int j = 0; j < QUANT_K; j += 4) { + vec4 v = dequantize4(ib, j / QUANT_R, a_offset); + v = v * dm.x + vec4(dm.y); + 
+#if QUANT_R == 2 + data_d[dst_idx + j/2 + 0] = v[0]; + data_d[dst_idx + j/2 + QUANT_K/2 + 0] = v[1]; + data_d[dst_idx + j/2 + 1] = v[2]; + data_d[dst_idx + j/2 + QUANT_K/2 + 1] = v[3]; +#else + data_d[dst_idx + j + 0] = v[0]; + data_d[dst_idx + j + 1] = v[1]; + data_d[dst_idx + j + 2] = v[2]; + data_d[dst_idx + j + 3] = v[3]; +#endif + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp new file mode 100644 index 000000000..ccf5b980a --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp @@ -0,0 +1,237 @@ +#version 450 + +#include "types.comp" +#include "generic_unary_head.comp" + +#if defined(DATA_A_IQ4_NL) +// 16 invocations needed for init_iq4nl_shmem +layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in; +#else +layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; +#endif + +layout (binding = 0) readonly buffer S {float data_s[];}; +layout (binding = 1) writeonly buffer Q {A_TYPE data_q[];}; + +#if defined(DATA_A_Q4_0) +void quantize(uint dst_idx, uint src_idx) +{ + float amax = 0.0; + float vmax = 0.0; + + [[unroll]] for (int j = 0; j < QUANT_K_Q4_0; ++j) { + const float v = data_s[src_idx + j]; + if (amax < abs(v)) { + amax = abs(v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = (d != 0.0) ? 
1.0/d : 0.0; + + data_q[dst_idx].d = float16_t(d); + + [[unroll]] for (int j = 0; j < QUANT_K_Q4_0/2; ++j) { + const float x0 = data_s[src_idx + 0 + j]*id; + const float x1 = data_s[src_idx + QUANT_K_Q4_0/2 + j]*id; + + const uint xi0 = min(15, int(x0 + 8.5)); + const uint xi1 = min(15, int(x1 + 8.5)); + + data_q[dst_idx].qs[j] = uint8_t(xi0 | (xi1 << 4)); + } +} +#endif + +#if defined(DATA_A_Q4_1) +void quantize(uint dst_idx, uint src_idx) +{ + float vmin = 1.0/0.0; + float vmax = -vmin; + + [[unroll]] for (int j = 0; j < QUANT_K_Q4_1; ++j) { + const float v = data_s[src_idx + j]; + + if (v < vmin) vmin = v; + if (v > vmax) vmax = v; + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = (d != 0.0) ? 1.0/d : 0.0; + + data_q[dst_idx].d = float16_t(d); + data_q[dst_idx].m = float16_t(vmin); + + [[unroll]] for (int j = 0; j < QUANT_K_Q4_1/2; ++j) { + const float x0 = (data_s[src_idx + 0 + j] - vmin)*id; + const float x1 = (data_s[src_idx + QUANT_K_Q4_1/2 + j] - vmin)*id; + + const uint xi0 = min(15, int(x0 + 0.5)); + const uint xi1 = min(15, int(x1 + 0.5)); + + data_q[dst_idx].qs[j] = uint8_t(xi0 | (xi1 << 4)); + } +} +#endif + +#if defined(DATA_A_Q5_0) +void quantize(uint dst_idx, uint src_idx) +{ + float amax = 0.0; + float vmax = 0.0; + + [[unroll]] for (int j = 0; j < QUANT_K_Q5_0; ++j) { + const float v = data_s[src_idx + j]; + if (amax < abs(v)) { + amax = abs(v); + vmax = v; + } + } + + const float d = vmax / -16; + const float id = (d != 0.0) ? 
1.0/d : 0.0; + + data_q[dst_idx].d = float16_t(d); + + uint32_t qh = 0; + [[unroll]] for (int j = 0; j < QUANT_K_Q5_0/2; ++j) { + const float x0 = data_s[src_idx + 0 + j]*id; + const float x1 = data_s[src_idx + QUANT_K_Q5_0/2 + j]*id; + + const uint xi0 = min(31, int(x0 + 16.5)); + const uint xi1 = min(31, int(x1 + 16.5)); + + data_q[dst_idx].qs[j] = uint8_t((xi0 & 0xf) | ((xi1 & 0xf) << 4)); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QUANT_K_Q5_0/2); + } + data_q[dst_idx].qh[0] = uint16_t(qh & 0xFFFF); + data_q[dst_idx].qh[1] = uint16_t(qh >> 16); +} +#endif + +#if defined(DATA_A_Q5_1) +void quantize(uint dst_idx, uint src_idx) +{ + float min = data_s[src_idx + 0]; + float max = min; + + [[unroll]] for (int j = 1; j < QUANT_K_Q5_1; ++j) { + const float v = data_s[src_idx + j]; + min = v < min ? v : min; + max = v > max ? v : max; + } + + const float d = (max - min) / 31; + const float id = (d != 0) ? 1.0/d : 0.0; + + data_q[dst_idx].d = float16_t(d); + data_q[dst_idx].m = float16_t(min); + + uint32_t qh = 0; + [[unroll]] for (int j = 0; j < QUANT_K_Q5_1/2; ++j) { + const float x0 = (data_s[src_idx + 0 + j] - min)*id; + const float x1 = (data_s[src_idx + QUANT_K_Q5_1/2 + j] - min)*id; + + const uint xi0 = uint(x0 + 0.5); + const uint xi1 = uint(x1 + 0.5); + + data_q[dst_idx].qs[j] = uint8_t((xi0 & 0xf) | ((xi1 & 0xf) << 4)); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QUANT_K_Q5_1/2); + } + data_q[dst_idx].qh = qh; +} +#endif + +#if defined(DATA_A_Q8_0) +void quantize(uint dst_idx, uint src_idx) +{ + float amax = 0.0; // absolute max + + [[unroll]] for (int j = 0; j < QUANT_K_Q8_0; j++) { + const float v = data_s[src_idx + j]; + amax = max(amax, abs(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = (d != 0.0) ? 
1.0/d : 0.0; + + data_q[dst_idx].d = float16_t(d); + + [[unroll]] for (int j = 0; j < QUANT_K_Q8_0; ++j) { + const float x0 = data_s[src_idx + j]*id; + + data_q[dst_idx].qs[j] = int8_t(round(x0)); + } +} +#endif + +#if defined(DATA_A_IQ4_NL) +uint best_index(float x) { + if (x <= kvalues_iq4nl[0]) return 0; + if (x >= kvalues_iq4nl[15]) return 15; + int ml = 0, mu = 15; + while (mu-ml > 1) { + int mav = (ml+mu)/2; + if (x < kvalues_iq4nl[mav]) mu = mav; else ml = mav; + } + return x - kvalues_iq4nl[mu-1] < kvalues_iq4nl[mu] - x ? mu-1 : mu; +} + +void quantize(uint dst_idx, uint src_idx) +{ + float amax = 0.0; + float vmax = 0.0; + + [[unroll]] for (int j = 0; j < QUANT_K_IQ4_NL; ++j) { + const float v = data_s[src_idx + j]; + if (amax < abs(v)) { + amax = abs(v); + vmax = v; + } + } + + float d = vmax / kvalues_iq4nl[0]; + const float id = (d != 0.0) ? 1.0/d : 0.0; + + float sumqx = 0, sumq2 = 0; + [[unroll]] for (int j = 0; j < QUANT_K_IQ4_NL/2; ++j) { + const float x0 = data_s[src_idx + 0 + j]*id; + const float x1 = data_s[src_idx + QUANT_K_IQ4_NL/2 + j]*id; + const uint xi0 = best_index(x0); + const uint xi1 = best_index(x1); + data_q[dst_idx].qs[j] = uint8_t(xi0 | (xi1 << 4)); + const float v0 = kvalues_iq4nl[xi0]; + const float v1 = kvalues_iq4nl[xi1]; + const float w0 = data_s[src_idx + 0 + j]*data_s[src_idx + 0 + j]; + const float w1 = data_s[src_idx + QUANT_K_IQ4_NL/2 + j]*data_s[src_idx + QUANT_K_IQ4_NL/2 + j]; + sumqx += w0*v0*data_s[src_idx + j] + w1*v1*data_s[src_idx + QUANT_K_IQ4_NL/2 + j]; + sumq2 += w0*v0*v0 + w1*v1*v1; + } + + data_q[dst_idx].d = float16_t(sumq2 > 0 ? 
sumqx/sumq2 : d); + +} +#endif + +void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); + if (gl_LocalInvocationIndex.x != 0) { + return; + } +#endif + + const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K; + + if (idx >= p.ne) { + return; + } + + uint dst_idx = dst_idx_quant(idx, QUANT_K); + uint src_idx = get_aoffset() + src0_idx(idx); + + quantize(dst_idx, src_idx); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp index 68d1bc9f1..8dc9d360d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp @@ -54,3 +54,23 @@ uint dst_idx(uint idx) { const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; } + +uint src0_idx_quant(uint idx, uint qk) { + const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); + const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); + const uint i02_offset = i02*p.ne01*p.ne00; + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); + const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; + return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + (i00/qk)*p.nb00; +} + +uint dst_idx_quant(uint idx, uint qk) { + const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12_offset = i12*p.ne11*p.ne10; + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; + return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + (i10/qk)*p.nb10; +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 
243839917..b7890ef1e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -417,6 +417,11 @@ void process_shaders() { string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}); string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); + for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) { + string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); + string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); + } + string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 0ed3e98e1..5cde8289f 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3808,6 +3808,12 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows } } + for (ggml_type type_dst : {GGML_TYPE_F32}) { + for (ggml_type type_src : all_types) { + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows + } + } for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous From 7a689c415e2aecea1a5ae438542afeaf69815d52 
Mon Sep 17 00:00:00 2001 From: musoles <135031143+musoles@users.noreply.github.com> Date: Fri, 17 Jan 2025 00:10:49 +0000 Subject: [PATCH 15/30] README : added kalavai to infrastructure list (#11216) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 413a16422..784669ce1 100644 --- a/README.md +++ b/README.md @@ -204,6 +204,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server +- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale From 960ec65273a9554ff2510ba285a970289256ebfb Mon Sep 17 00:00:00 2001 From: David Renshaw Date: Fri, 17 Jan 2025 02:12:01 -0500 Subject: [PATCH 16/30] llama : fix deprecation message: vocabable -> vocab (#11269) --- include/llama.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index 352c3417e..be6802eef 100644 --- a/include/llama.h +++ b/include/llama.h @@ -961,7 +961,7 @@ extern "C" { LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab); LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab); - DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocabable_get_text instead"); + DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead"); DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead"); DEPRECATED(LLAMA_API enum 
llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead"); DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead"); From a133566d34a1dd3693c504786963bf1b7b7d8c0e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 09:28:00 +0200 Subject: [PATCH 17/30] vocab : fix double-eos check (#11273) ggml-ci --- src/llama-vocab.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 4969d2628..9a680aed4 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -439,7 +439,7 @@ struct llm_tokenizer_bpe_session { "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " "Are you sure this is what you want?\n", __FUNCTION__); } - if (vocab.get_add_bos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) { + if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) { LLAMA_LOG_WARN( "%s: Added a EOS token to the prompt as specified by the model but the prompt " "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. " From 667d72846c06b2cf4f7c8a4265e210991a49706b Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Fri, 17 Jan 2025 10:57:09 +0200 Subject: [PATCH 18/30] rpc : early register backend devices (#11262) Early register RPC devices and do not propagate RPC specifics in the llama model structures. 
ref: #10609 --- common/arg.cpp | 27 +++++++++++++++++- common/common.cpp | 1 - common/common.h | 1 - examples/llama-bench/llama-bench.cpp | 37 ++++++++++++++++++++++--- ggml/include/ggml-backend.h | 2 ++ ggml/src/ggml-backend-impl.h | 1 - include/llama.h | 3 -- src/llama-model.cpp | 1 - src/llama-model.h | 2 -- src/llama.cpp | 41 ---------------------------- 10 files changed, 61 insertions(+), 55 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index dd10b6352..9069950eb 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -376,6 +376,30 @@ static std::vector parse_device_list(const std::string & val return devices; } +static void add_rpc_devices(std::string servers) { + auto rpc_servers = string_split(servers, ','); + if (rpc_servers.empty()) { + throw std::invalid_argument("no RPC servers specified"); + } + ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); + if (!rpc_reg) { + throw std::invalid_argument("failed to find RPC backend"); + } + typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint); + ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device"); + if (!ggml_backend_rpc_add_device_fn) { + throw std::invalid_argument("failed to find RPC device add function"); + } + for (const auto & server : rpc_servers) { + ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str()); + if (dev) { + ggml_backend_device_register(dev); + } else { + throw std::invalid_argument("failed to register RPC device"); + } + } +} + bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { auto ctx_arg = common_params_parser_init(params, ex, print_usage); const common_params params_org = ctx_arg.params; // the example can modify the default params @@ -1385,7 +1409,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex 
{"--rpc"}, "SERVERS", "comma separated list of RPC servers", [](common_params & params, const std::string & value) { - params.rpc_servers = value; + add_rpc_devices(value); + GGML_UNUSED(params); } ).set_env("LLAMA_ARG_RPC")); } diff --git a/common/common.cpp b/common/common.cpp index a6f9252b2..451826d5d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1043,7 +1043,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; } - mparams.rpc_servers = params.rpc_servers.c_str(); mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; diff --git a/common/common.h b/common/common.h index 4fab1319a..691141d6b 100644 --- a/common/common.h +++ b/common/common.h @@ -246,7 +246,6 @@ struct common_params { std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT std::string logits_file = ""; // file for saving *all* logits // NOLINT - std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. 
reverse prompts) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a3b4c5ac8..4ac19ca86 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -683,7 +683,7 @@ struct cmd_params_instance { bool cpu_strict; int poll; int n_gpu_layers; - std::string rpc_servers; + std::string rpc_servers_str; llama_split_mode split_mode; int main_gpu; bool no_kv_offload; @@ -696,8 +696,37 @@ struct cmd_params_instance { llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = n_gpu_layers; - if (!rpc_servers.empty()) { - mparams.rpc_servers = rpc_servers.c_str(); + if (!rpc_servers_str.empty()) { + auto rpc_servers = string_split(rpc_servers_str, ','); + + // add RPC devices + if (!rpc_servers.empty()) { + ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); + if (!rpc_reg) { + fprintf(stderr, "%s: failed to find RPC backend\n", __func__); + exit(1); + } + + typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint); + ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device"); + if (!ggml_backend_rpc_add_device_fn) { + fprintf(stderr, "%s: failed to find RPC device add function\n", __func__); + exit(1); + } + static std::vector devices; + devices.clear(); + for (const std::string & server : rpc_servers) { + ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str()); + if (dev) { + devices.push_back(dev); + } else { + fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str()); + exit(1); + } + } + devices.push_back(nullptr); + mparams.devices = devices.data(); + } } mparams.split_mode = split_mode; mparams.main_gpu = main_gpu; @@ -708,7 +737,7 @@ struct cmd_params_instance { } bool equal_mparams(const cmd_params_instance & other) const { - return model == other.model && n_gpu_layers == 
other.n_gpu_layers && rpc_servers == other.rpc_servers && + return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split; } diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 7221a0830..fc9571c82 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -203,6 +203,8 @@ extern "C" { // Backend registry // + GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); + // Backend (reg) enumeration GGML_API size_t ggml_backend_reg_count(void); GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index); diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 36d72e95f..d1c2d76d8 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -208,7 +208,6 @@ extern "C" { // Internal backend registry API GGML_API void ggml_backend_register(ggml_backend_reg_t reg); - GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); // Add backend dynamic loading support to the backend diff --git a/include/llama.h b/include/llama.h index be6802eef..298b8d1bc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -288,9 +288,6 @@ extern "C" { // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() const float * tensor_split; - // comma separated list of RPC servers to use for offloading - const char * rpc_servers; - // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. // If the provided progress_callback returns true, model loading continues. // If it returns false, model loading is immediately aborted. 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f90f5e746..c2d23a8d3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3717,7 +3717,6 @@ struct llama_model_params llama_model_default_params() { /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, - /*.rpc_servers =*/ nullptr, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.kv_overrides =*/ nullptr, diff --git a/src/llama-model.h b/src/llama-model.h index 4cc8abb75..a7c304447 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -323,8 +323,6 @@ struct llama_model { // gguf metadata std::unordered_map gguf_kv; - std::vector rpc_servers; - // list of devices used in this model std::vector devices; diff --git a/src/llama.cpp b/src/llama.cpp index fede23d19..e8cfe5012 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9399,47 +9399,6 @@ static struct llama_model * llama_model_load_from_file_impl( }; } - if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') { - // split the servers set them into model->rpc_servers - std::string servers(params.rpc_servers); - size_t pos = 0; - while ((pos = servers.find(',')) != std::string::npos) { - std::string server = servers.substr(0, pos); - model->rpc_servers.push_back(server); - servers.erase(0, pos + 1); - } - model->rpc_servers.push_back(servers); - } - - // add RPC devices - if (!model->rpc_servers.empty()) { - ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); - if (!rpc_reg) { - LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__); - llama_model_free(model); - return nullptr; - } - - typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint); - ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device"); - if (!ggml_backend_rpc_add_device_fn) { - LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", 
__func__); - llama_model_free(model); - return nullptr; - } - - for (const std::string & server : model->rpc_servers) { - ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str()); - if (dev) { - model->devices.push_back(dev); - } else { - LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str()); - llama_model_free(model); - return nullptr; - } - } - } - // create list of devices to use with this model if (params.devices) { for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) { From 3edfa7d3753c29e44b964c0ff424d2ea8d5fdee6 Mon Sep 17 00:00:00 2001 From: codezjx Date: Fri, 17 Jan 2025 20:57:56 +0800 Subject: [PATCH 19/30] llama.android: add field formatChat to control whether to parse special tokens when send message (#11270) --- examples/llama.android/llama/src/main/cpp/llama-android.cpp | 6 ++++-- .../llama/src/main/java/android/llama/cpp/LLamaAndroid.kt | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 99b14961d..2a73983a9 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -347,6 +347,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( jlong context_pointer, jlong batch_pointer, jstring jtext, + jboolean format_chat, jint n_len ) { @@ -356,7 +357,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( const auto context = reinterpret_cast(context_pointer); const auto batch = reinterpret_cast(batch_pointer); - const auto tokens_list = common_tokenize(context, text, 1); + bool parse_special = (format_chat == JNI_TRUE); + const auto tokens_list = common_tokenize(context, text, true, parse_special); auto n_ctx = llama_n_ctx(context); auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); @@ -368,7 +370,7 @@ 
Java_android_llama_cpp_LLamaAndroid_completion_1init( } for (auto id : tokens_list) { - LOGi("%s", common_token_to_piece(context, id).c_str()); + LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id); } common_batch_clear(*batch); diff --git a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt index cf520e459..b964d93e3 100644 --- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt +++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt @@ -65,6 +65,7 @@ class LLamaAndroid { context: Long, batch: Long, text: String, + formatChat: Boolean, nLen: Int ): Int @@ -115,10 +116,10 @@ class LLamaAndroid { } } - fun send(message: String): Flow = flow { + fun send(message: String, formatChat: Boolean = false): Flow = flow { when (val state = threadLocalState.get()) { is State.Loaded -> { - val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) + val ncur = IntVar(completion_init(state.context, state.batch, message, formatChat, nlen)) while (ncur.value <= nlen) { val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur) if (str == null) { From 44e18ef93995f3040660750b527e5becf85899d0 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 18 Jan 2025 02:26:50 -0600 Subject: [PATCH 20/30] vulkan: fix coopmat2 flash attention for non-contiguous inputs (#11281) Add code similar to mul_mm_cm2 to force alignment of strides, to avoid a performance regression. Add noncontiguous FA tests in test-backend-ops. Fixes #11268. 
--- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 43 ++++++++++++++++--- .../vulkan-shaders/flash_attn_cm2.comp | 20 +++++++++ tests/test-backend-ops.cpp | 31 ++++++++++--- 3 files changed, 82 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 8e3e91495..437e9cdcc 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -386,10 +386,13 @@ struct vk_flash_attn_push_constants { uint32_t nev3; uint32_t nem1; + uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t nb11; uint32_t nb12; uint32_t nb13; + uint32_t nb21; uint32_t nb22; uint32_t nb23; uint32_t nb31; @@ -4809,7 +4812,14 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx } assert(pipelines); - bool aligned = (KV % pipelines[1]->align) == 0; + const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type)); + const uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type)); + const uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type)); + + bool aligned = (KV % pipelines[1]->align) == 0 && + // the "aligned" shader variant will forcibly align strides, for performance + (q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0; + vk_pipeline pipeline = pipelines[aligned]; assert(pipeline); @@ -4845,15 +4855,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx if (ctx->device->uma) { ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset); - ggml_vk_host_get(ctx->device, k->data, d_K, q_buf_offset); - ggml_vk_host_get(ctx->device, v->data, d_V, q_buf_offset); - ggml_vk_host_get(ctx->device, dst->data, d_D, q_buf_offset); + ggml_vk_host_get(ctx->device, k->data, d_K, k_buf_offset); + ggml_vk_host_get(ctx->device, v->data, d_V, v_buf_offset); + ggml_vk_host_get(ctx->device, dst->data, d_D, d_buf_offset); Q_uma = d_Q != nullptr; K_uma = d_K != nullptr; V_uma = d_V != nullptr; D_uma = d_D != nullptr; if (mask) { - 
ggml_vk_host_get(ctx->device, mask->data, d_M, q_buf_offset); + ggml_vk_host_get(ctx->device, mask->data, d_M, m_buf_offset); M_uma = d_M != nullptr; } } @@ -4891,7 +4901,18 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx } } - const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, nem1, (uint32_t)nbq2, (uint32_t)nbq3, (uint32_t)nbk2, (uint32_t)nbk3, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, mask != nullptr, n_head_log2, m0, m1 }; + const vk_flash_attn_push_constants pc = { N, KV, + (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, + (uint32_t)neq2, (uint32_t)neq3, + (uint32_t)nek2, (uint32_t)nek3, + (uint32_t)nev2, (uint32_t)nev3, + nem1, + q_stride, (uint32_t)nbq2, (uint32_t)nbq3, + k_stride, (uint32_t)nbk2, (uint32_t)nbk3, + v_stride, (uint32_t)nbv2, (uint32_t)nbv3, + nbm1, + scale, max_bias, logit_softcap, + mask != nullptr, n_head_log2, m0, m1 }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, @@ -8668,6 +8689,7 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) { ggml_tensor * src0 = tensor->src[0]; ggml_tensor * src1 = tensor->src[1]; ggml_tensor * src2 = tensor->src[2]; + ggml_tensor * src3 = tensor->src[3]; void * tensor_data = tensor->data; @@ -8730,6 +8752,9 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) { if (src2 != nullptr) { std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; } + if (src3 != nullptr) { + std::cerr << "src3=" << src3 << " src3->name=" 
<< src3->name << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl; + } std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; std::cerr << std::endl << "Result:" << std::endl; ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3); @@ -8774,6 +8799,9 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) { if (src2 != nullptr) { std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; } + if (src3 != nullptr) { + std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl; + } std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; std::cerr << std::endl << "Result:" << std::endl; ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0); @@ -8796,6 +8824,9 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) { if (src2 != nullptr) { std::cerr << 
"src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl; } + if (src3 != nullptr) { + std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl; + } std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; std::cerr << std::endl << "Result:" << std::endl; ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index c5be8131b..ca3a59b8f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -42,10 +42,13 @@ layout (push_constant) uniform parameter { uint32_t nev3; uint32_t nem1; + uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t nb11; uint32_t nb12; uint32_t nb13; + uint32_t nb21; uint32_t nb22; uint32_t nb23; uint32_t nb31; @@ -146,6 +149,23 @@ void main() { tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D); tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D); + // nb?1 are already divided by the type size and are in units of elements + uint32_t q_stride = p.nb01; + uint32_t k_stride = p.nb11; + uint32_t v_stride = p.nb21; + // 
hint to the compiler that strides are aligned for the aligned variant of the shader + if (Clamp != gl_CooperativeMatrixClampModeConstantNV) + { + q_stride &= ~7; +#if !defined(BLOCK_SIZE) + k_stride &= ~7; + v_stride &= ~7; +#endif + } + tensorLayoutQ = setTensorLayoutStrideNV(tensorLayoutQ, q_stride, 1); + tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1); + tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1); + coopmat Q; coopmat Qf16; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 5cde8289f..74d1bee39 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3046,9 +3046,10 @@ struct test_flash_attn_ext : public test_case { const float logit_softcap; // Gemma 2 const ggml_type type_KV; + std::array permute; std::string vars() override { - return VARS_TO_STR8(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV); + return VARS_TO_STR9(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV, permute); } double max_nmse_err() override { @@ -3063,19 +3064,33 @@ struct test_flash_attn_ext : public test_case { } test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, - bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16) - : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {} + bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16, + std::array permute = {0, 1, 2, 3}) + : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV), permute(permute) {} ggml_tensor * build_graph(ggml_context * ctx) override { const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV)); - ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1); + auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, 
int64_t ne3) -> ggml_tensor * { + int64_t ne[4] = {ne0, ne1, ne2, ne3}; + int64_t ne_perm[4]; + for (int i = 0; i < 4; ++i) { + ne_perm[permute[i]] = ne[i]; + } + ggml_tensor * t = ggml_new_tensor_4d(ctx, type, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3]); + if (permute != std::array{0, 1, 2, 3}) { + t = ggml_permute(ctx, t, permute[0], permute[1], permute[2], permute[3]); + } + return t; + }; + + ggml_tensor * q = create_permuted(GGML_TYPE_F32, hs_padded, nb, nh, 1); ggml_set_name(q, "q"); - ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1); + ggml_tensor * k = create_permuted(type_KV, hs_padded, kv, nh, 1); ggml_set_name(k, "k"); - ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1); + ggml_tensor * v = create_permuted(type_KV, hs_padded, kv, nh, 1); ggml_set_name(v, "v"); ggml_tensor * m = nullptr; @@ -4167,6 +4182,10 @@ static std::vector> make_test_cases_eval() { for (int nb : { 1, 3, 32, 35, }) { for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) { test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV)); + // run fewer test cases permuted + if (mask == true && max_bias == 0.0f && logit_softcap == 0 && kv == 512) { + test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV, {0, 2, 1, 3})); + } } } } From 6390a998bfc63241fde8509022b0768a71bf20bb Mon Sep 17 00:00:00 2001 From: LostRuins Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 18 Jan 2025 18:20:57 +0800 Subject: [PATCH 21/30] tts : add guide tokens support (#11186) * Added the ability to use guide tokens for OuteTTS, greatly improving TTS recitation accuracy over long input sequences. 
* applied linting suggestions, updated to latest llama_vocab changes, added a safety check, added newline to guide token start --- common/arg.cpp | 7 +++++++ common/common.h | 2 ++ examples/tts/tts.cpp | 45 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 9069950eb..dede335fb 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2254,6 +2254,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.vocoder.model = value; } ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--tts-use-guide-tokens"}, + "Use guide tokens to improve TTS word recall", + [](common_params & params) { + params.vocoder.use_guide_tokens = true; + } + ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); // model-specific add_opt(common_arg( diff --git a/common/common.h b/common/common.h index 691141d6b..3bcc637cc 100644 --- a/common/common.h +++ b/common/common.h @@ -184,6 +184,8 @@ struct common_params_vocoder { std::string model = ""; // model path // NOLINT std::string model_url = ""; // model url to download // NOLINT + + bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT }; struct common_params { diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 5a9161181..f78f76303 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -425,6 +425,33 @@ static void prompt_init(llama_tokens & prompt, const llama_vocab * vocab) { prompt_add(prompt, vocab, "<|im_start|>\n", true, true); } +static std::vector prepare_guide_tokens(const llama_vocab * vocab, const std::string & str) { + const std::string& delimiter = "<|text_sep|>"; + + std::vector result; + size_t start = 0; + size_t end = str.find(delimiter); + + //first token is always a newline, as it was not previously added + result.push_back(common_tokenize(vocab, "\n", false, true)[0]); + + while (end != std::string::npos) 
{ + std::string current_word = str.substr(start, end - start); + auto tmp = common_tokenize(vocab, current_word, false, true); + result.push_back(tmp[0]); + start = end + delimiter.length(); + end = str.find(delimiter, start); + } + + // Add the last part + std::string current_word = str.substr(start); + auto tmp = common_tokenize(vocab, current_word, false, true); + if (tmp.size() > 0) { + result.push_back(tmp[0]); + } + return result; +} + int main(int argc, char ** argv) { common_params params; @@ -494,6 +521,7 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); std::vector codes; + std::vector guide_tokens; // process prompt and generate voice codes { @@ -508,6 +536,9 @@ int main(int argc, char ** argv) { // convert the input text into the necessary format expected by OuteTTS { std::string prompt_clean = process_text(params.prompt); + if (params.vocoder.use_guide_tokens) { + guide_tokens = prepare_guide_tokens(vocab, prompt_clean); + } LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str()); @@ -717,6 +748,8 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 int n_past = batch.n_tokens; int n_decode = 0; + bool next_token_uses_guide_token = true; + while (n_decode <= n_predict) { // prepare the next batch common_batch_clear(batch); @@ -728,7 +761,17 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 continue; } - const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]); + llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]); + + //guide tokens help prevent hallucinations by forcing the TTS to use the correct word + if (!guide_tokens.empty() && next_token_uses_guide_token && !llama_vocab_is_control(vocab, new_token_id) && !llama_vocab_is_eog(vocab, new_token_id)) { + llama_token guide_token = guide_tokens[0]; + guide_tokens.erase(guide_tokens.begin()); + new_token_id = guide_token; //ensure correct word 
fragment is used + } + + //this is the token id that always precedes a new word + next_token_uses_guide_token = (new_token_id == 198); common_sampler_accept(smpl[i], new_token_id, true); From f26c87417999209e7f4576b4f3ecf7a5b9c66a29 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 18 Jan 2025 13:18:32 +0200 Subject: [PATCH 22/30] scripts : restore hf.sh (#11288) ggml-ci --- scripts/hf.sh | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100755 scripts/hf.sh diff --git a/scripts/hf.sh b/scripts/hf.sh new file mode 100755 index 000000000..b251925fa --- /dev/null +++ b/scripts/hf.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# +# Shortcut for downloading HF models +# +# Usage: +# ./llama-cli -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./llama-cli -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./llama-cli -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) +# + +# all logs go to stderr +function log { + echo "$@" 1>&2 +} + +function usage { + log "Usage: $0 [[--url] ] [--repo ] [--file ] [--outdir [-h|--help]" + exit 1 +} + +# check for curl or wget +function has_cmd { + if ! [ -x "$(command -v $1)" ]; then + return 1 + fi +} + +if has_cmd wget; then + cmd="wget -q -c -O %s/%s %s" +elif has_cmd curl; then + cmd="curl -C - -f --output-dir %s -o %s -L %s" +else + log "[E] curl or wget not found" + exit 1 +fi + +url="" +repo="" +file="" +outdir="." 
+ +# parse args +while [[ $# -gt 0 ]]; do + case "$1" in + --url) + url="$2" + shift 2 + ;; + --repo) + repo="$2" + shift 2 + ;; + --file) + file="$2" + shift 2 + ;; + --outdir) + outdir="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + url="$1" + shift + ;; + esac +done + +if [ -n "$repo" ] && [ -n "$file" ]; then + url="https://huggingface.co/$repo/resolve/main/$file" +fi + +if [ -z "$url" ]; then + log "[E] missing --url" + usage +fi + +# check if the URL is a HuggingFace model, and if so, try to download it +is_url=false + +if [[ ${#url} -gt 22 ]]; then + if [[ ${url:0:22} == "https://huggingface.co" ]]; then + is_url=true + fi +fi + +if [ "$is_url" = false ]; then + log "[E] invalid URL, must start with https://huggingface.co" + exit 0 +fi + +# replace "blob/main" with "resolve/main" +url=${url/blob\/main/resolve\/main} + +basename=$(basename $url) + +log "[+] attempting to download $basename" + +if [ -n "$cmd" ]; then + cmd=$(printf "$cmd" "$outdir" "$basename" "$url") + log "[+] $cmd" + if $cmd; then + echo $outdir/$basename + exit 0 + fi +fi + +log "[-] failed to download" + +exit 1 From f30f099228f774209aa3010b78dfbe5d262e69aa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 18 Jan 2025 14:12:05 +0100 Subject: [PATCH 23/30] server : implement cancellable request (#11285) * server : implement cancellable request * fix typo * httplib 0.18.5 * fix i underflow --- examples/server/httplib.h | 1726 +++++++++++++---- examples/server/server.cpp | 73 +- examples/server/tests/unit/test_completion.py | 21 + examples/server/tests/utils.py | 7 +- 4 files changed, 1396 insertions(+), 431 deletions(-) diff --git a/examples/server/httplib.h b/examples/server/httplib.h index f360bd93e..c2f12dd2a 100644 --- a/examples/server/httplib.h +++ b/examples/server/httplib.h @@ -8,7 +8,7 @@ #ifndef CPPHTTPLIB_HTTPLIB_H #define CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_VERSION "0.15.3" +#define CPPHTTPLIB_VERSION "0.18.5" /* * Configuration @@ -18,8 +18,12 @@ #define 
CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5 #endif +#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND +#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND 10000 +#endif + #ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT -#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 +#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 100 #endif #ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND @@ -30,20 +34,36 @@ #define CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND 0 #endif -#ifndef CPPHTTPLIB_READ_TIMEOUT_SECOND -#define CPPHTTPLIB_READ_TIMEOUT_SECOND 5 +#ifndef CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND +#define CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND 5 #endif -#ifndef CPPHTTPLIB_READ_TIMEOUT_USECOND -#define CPPHTTPLIB_READ_TIMEOUT_USECOND 0 +#ifndef CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND +#define CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND 0 #endif -#ifndef CPPHTTPLIB_WRITE_TIMEOUT_SECOND -#define CPPHTTPLIB_WRITE_TIMEOUT_SECOND 5 +#ifndef CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND +#define CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND 5 #endif -#ifndef CPPHTTPLIB_WRITE_TIMEOUT_USECOND -#define CPPHTTPLIB_WRITE_TIMEOUT_USECOND 0 +#ifndef CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND +#define CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND 0 +#endif + +#ifndef CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND +#define CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND 300 +#endif + +#ifndef CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND +#define CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND 0 +#endif + +#ifndef CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND +#define CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND 5 +#endif + +#ifndef CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND +#define CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND 0 #endif #ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND @@ -90,8 +110,12 @@ #define CPPHTTPLIB_TCP_NODELAY false #endif +#ifndef CPPHTTPLIB_IPV6_V6ONLY +#define CPPHTTPLIB_IPV6_V6ONLY false +#endif + #ifndef CPPHTTPLIB_RECV_BUFSIZ -#define CPPHTTPLIB_RECV_BUFSIZ size_t(4096u) +#define CPPHTTPLIB_RECV_BUFSIZ size_t(16384u) #endif #ifndef CPPHTTPLIB_COMPRESSION_BUFSIZ @@ -145,11 +169,11 @@ 
using ssize_t = long; #endif // _MSC_VER #ifndef S_ISREG -#define S_ISREG(m) (((m)&S_IFREG) == S_IFREG) +#define S_ISREG(m) (((m) & S_IFREG) == S_IFREG) #endif // S_ISREG #ifndef S_ISDIR -#define S_ISDIR(m) (((m)&S_IFDIR) == S_IFDIR) +#define S_ISDIR(m) (((m) & S_IFDIR) == S_IFDIR) #endif // S_ISDIR #ifndef NOMINMAX @@ -269,7 +293,12 @@ using socket_t = int; #include #include -#if OPENSSL_VERSION_NUMBER < 0x30000000L +#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER) +#if OPENSSL_VERSION_NUMBER < 0x1010107f +#error Please use OpenSSL or a current version of BoringSSL +#endif +#define SSL_get1_peer_certificate SSL_get_peer_certificate +#elif OPENSSL_VERSION_NUMBER < 0x30000000L #error Sorry, OpenSSL versions prior to 3.0.0 are not supported #endif @@ -312,16 +341,63 @@ make_unique(std::size_t n) { return std::unique_ptr(new RT[n]); } -struct ci { - bool operator()(const std::string &s1, const std::string &s2) const { - return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), - s2.end(), - [](unsigned char c1, unsigned char c2) { - return ::tolower(c1) < ::tolower(c2); - }); +namespace case_ignore { + +inline unsigned char to_lower(int c) { + const static unsigned char table[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, + 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 
165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 224, 225, 226, + 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, + 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, + }; + return table[(unsigned char)(char)c]; +} + +inline bool equal(const std::string &a, const std::string &b) { + return a.size() == b.size() && + std::equal(a.begin(), a.end(), b.begin(), [](char ca, char cb) { + return to_lower(ca) == to_lower(cb); + }); +} + +struct equal_to { + bool operator()(const std::string &a, const std::string &b) const { + return equal(a, b); } }; +struct hash { + size_t operator()(const std::string &key) const { + return hash_core(key.data(), key.size(), 0); + } + + size_t hash_core(const char *s, size_t l, size_t h) const { + return (l == 0) ? h + : hash_core(s + 1, l - 1, + // Unsets the 6 high bits of h, therefore no + // overflow happens + (((std::numeric_limits::max)() >> 6) & + h * 33) ^ + static_cast(to_lower(*s))); + } +}; + +} // namespace case_ignore + // This is based on // "http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4189". 
@@ -427,7 +503,9 @@ enum StatusCode { NetworkAuthenticationRequired_511 = 511, }; -using Headers = std::multimap; +using Headers = + std::unordered_multimap; using Params = std::multimap; using Match = std::smatch; @@ -534,6 +612,7 @@ using Ranges = std::vector; struct Request { std::string method; std::string path; + Params params; Headers headers; std::string body; @@ -545,11 +624,11 @@ struct Request { // for server std::string version; std::string target; - Params params; MultipartFormDataMap files; Ranges ranges; Match matches; std::unordered_map path_params; + std::function is_connection_closed = []() { return true; }; // for client ResponseHandler response_handler; @@ -560,8 +639,10 @@ struct Request { #endif bool has_header(const std::string &key) const; - std::string get_header_value(const std::string &key, size_t id = 0) const; - uint64_t get_header_value_u64(const std::string &key, size_t id = 0) const; + std::string get_header_value(const std::string &key, const char *def = "", + size_t id = 0) const; + uint64_t get_header_value_u64(const std::string &key, uint64_t def = 0, + size_t id = 0) const; size_t get_header_value_count(const std::string &key) const; void set_header(const std::string &key, const std::string &val); @@ -592,8 +673,10 @@ struct Response { std::string location; // Redirect location bool has_header(const std::string &key) const; - std::string get_header_value(const std::string &key, size_t id = 0) const; - uint64_t get_header_value_u64(const std::string &key, size_t id = 0) const; + std::string get_header_value(const std::string &key, const char *def = "", + size_t id = 0) const; + uint64_t get_header_value_u64(const std::string &key, uint64_t def = 0, + size_t id = 0) const; size_t get_header_value_count(const std::string &key) const; void set_header(const std::string &key, const std::string &val); @@ -614,6 +697,10 @@ struct Response { const std::string &content_type, ContentProviderWithoutLength provider, 
ContentProviderResourceReleaser resource_releaser = nullptr); + void set_file_content(const std::string &path, + const std::string &content_type); + void set_file_content(const std::string &path); + Response() = default; Response(const Response &) = default; Response &operator=(const Response &) = default; @@ -631,6 +718,8 @@ struct Response { ContentProviderResourceReleaser content_provider_resource_releaser_; bool is_chunked_content_provider_ = false; bool content_provider_success_ = false; + std::string file_content_path_; + std::string file_content_content_type_; }; class Stream { @@ -646,8 +735,6 @@ public: virtual void get_local_ip_and_port(std::string &ip, int &port) const = 0; virtual socket_t socket() const = 0; - template - ssize_t write_format(const char *fmt, const Args &...args); ssize_t write(const char *ptr); ssize_t write(const std::string &s); }; @@ -719,13 +806,18 @@ private: if (pool_.shutdown_ && pool_.jobs_.empty()) { break; } - fn = std::move(pool_.jobs_.front()); + fn = pool_.jobs_.front(); pool_.jobs_.pop_front(); } assert(true == static_cast(fn)); fn(); } + +#if defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(OPENSSL_IS_BORINGSSL) && \ + !defined(LIBRESSL_VERSION_NUMBER) + OPENSSL_thread_stop(); +#endif } ThreadPool &pool_; @@ -787,7 +879,6 @@ public: bool match(Request &request) const override; private: - static constexpr char marker = ':'; // Treat segment separators as the end of path parameter capture // Does not need to handle query parameters as they are parsed before path // matching @@ -871,8 +962,13 @@ public: Server &set_default_file_mimetype(const std::string &mime); Server &set_file_request_handler(Handler handler); - Server &set_error_handler(HandlerWithResponse handler); - Server &set_error_handler(Handler handler); + template + Server &set_error_handler(ErrorHandlerFunc &&handler) { + return set_error_handler_core( + std::forward(handler), + std::is_convertible{}); + } + Server &set_exception_handler(ExceptionHandler 
handler); Server &set_pre_routing_handler(HandlerWithResponse handler); Server &set_post_routing_handler(Handler handler); @@ -882,6 +978,7 @@ public: Server &set_address_family(int family); Server &set_tcp_nodelay(bool on); + Server &set_ipv6_v6only(bool on); Server &set_socket_options(SocketOptions socket_options); Server &set_default_headers(Headers headers); @@ -914,21 +1011,24 @@ public: bool is_running() const; void wait_until_ready() const; void stop(); + void decommission(); std::function new_task_queue; protected: - bool process_request(Stream &strm, bool close_connection, + bool process_request(Stream &strm, const std::string &remote_addr, + int remote_port, const std::string &local_addr, + int local_port, bool close_connection, bool &connection_closed, const std::function &setup_request); std::atomic svr_sock_{INVALID_SOCKET}; size_t keep_alive_max_count_ = CPPHTTPLIB_KEEPALIVE_MAX_COUNT; time_t keep_alive_timeout_sec_ = CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND; - time_t read_timeout_sec_ = CPPHTTPLIB_READ_TIMEOUT_SECOND; - time_t read_timeout_usec_ = CPPHTTPLIB_READ_TIMEOUT_USECOND; - time_t write_timeout_sec_ = CPPHTTPLIB_WRITE_TIMEOUT_SECOND; - time_t write_timeout_usec_ = CPPHTTPLIB_WRITE_TIMEOUT_USECOND; + time_t read_timeout_sec_ = CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND; + time_t read_timeout_usec_ = CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND; + time_t write_timeout_sec_ = CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND; + time_t write_timeout_usec_ = CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND; time_t idle_interval_sec_ = CPPHTTPLIB_IDLE_INTERVAL_SECOND; time_t idle_interval_usec_ = CPPHTTPLIB_IDLE_INTERVAL_USECOND; size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH; @@ -943,6 +1043,9 @@ private: static std::unique_ptr make_matcher(const std::string &pattern); + Server &set_error_handler_core(HandlerWithResponse handler, std::true_type); + Server &set_error_handler_core(Handler handler, std::false_type); + socket_t create_server_socket(const std::string &host, 
int port, int socket_flags, SocketOptions socket_options) const; @@ -985,7 +1088,7 @@ private: virtual bool process_and_close_socket(socket_t sock); std::atomic is_running_{false}; - std::atomic done_{false}; + std::atomic is_decommisioned{false}; struct MountPointEntry { std::string mount_point; @@ -1018,6 +1121,7 @@ private: int address_family_ = AF_UNSPEC; bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY; + bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY; SocketOptions socket_options_ = default_socket_options; Headers default_headers_; @@ -1037,6 +1141,7 @@ enum class Error { SSLConnection, SSLLoadingCerts, SSLServerVerification, + SSLServerHostnameVerification, UnsupportedMultipartBoundaryChars, Compression, ConnectionTimeout, @@ -1074,9 +1179,10 @@ public: // Request Headers bool has_request_header(const std::string &key) const; std::string get_request_header_value(const std::string &key, + const char *def = "", size_t id = 0) const; uint64_t get_request_header_value_u64(const std::string &key, - size_t id = 0) const; + uint64_t def = 0, size_t id = 0) const; size_t get_request_header_value_count(const std::string &key) const; private: @@ -1140,10 +1246,18 @@ public: const std::string &content_type); Result Post(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); + Result Post(const std::string &path, const Headers &headers, const char *body, + size_t content_length, const std::string &content_type, + Progress progress); Result Post(const std::string &path, const std::string &body, const std::string &content_type); + Result Post(const std::string &path, const std::string &body, + const std::string &content_type, Progress progress); Result Post(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); + Result Post(const std::string &path, const Headers &headers, + const std::string &body, const std::string &content_type, + Progress progress); Result 
Post(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); @@ -1159,6 +1273,8 @@ public: Result Post(const std::string &path, const Params ¶ms); Result Post(const std::string &path, const Headers &headers, const Params ¶ms); + Result Post(const std::string &path, const Headers &headers, + const Params ¶ms, Progress progress); Result Post(const std::string &path, const MultipartFormDataItems &items); Result Post(const std::string &path, const Headers &headers, const MultipartFormDataItems &items); @@ -1173,10 +1289,18 @@ public: const std::string &content_type); Result Put(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); + Result Put(const std::string &path, const Headers &headers, const char *body, + size_t content_length, const std::string &content_type, + Progress progress); Result Put(const std::string &path, const std::string &body, const std::string &content_type); + Result Put(const std::string &path, const std::string &body, + const std::string &content_type, Progress progress); Result Put(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); + Result Put(const std::string &path, const Headers &headers, + const std::string &body, const std::string &content_type, + Progress progress); Result Put(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); Result Put(const std::string &path, @@ -1191,6 +1315,8 @@ public: Result Put(const std::string &path, const Params ¶ms); Result Put(const std::string &path, const Headers &headers, const Params ¶ms); + Result Put(const std::string &path, const Headers &headers, + const Params ¶ms, Progress progress); Result Put(const std::string &path, const MultipartFormDataItems &items); Result Put(const std::string &path, const Headers &headers, const MultipartFormDataItems 
&items); @@ -1203,13 +1329,23 @@ public: Result Patch(const std::string &path); Result Patch(const std::string &path, const char *body, size_t content_length, const std::string &content_type); + Result Patch(const std::string &path, const char *body, size_t content_length, + const std::string &content_type, Progress progress); Result Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); + Result Patch(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, Progress progress); Result Patch(const std::string &path, const std::string &body, const std::string &content_type); + Result Patch(const std::string &path, const std::string &body, + const std::string &content_type, Progress progress); Result Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); + Result Patch(const std::string &path, const Headers &headers, + const std::string &body, const std::string &content_type, + Progress progress); Result Patch(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); @@ -1227,13 +1363,24 @@ public: Result Delete(const std::string &path, const Headers &headers); Result Delete(const std::string &path, const char *body, size_t content_length, const std::string &content_type); + Result Delete(const std::string &path, const char *body, + size_t content_length, const std::string &content_type, + Progress progress); Result Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); + Result Delete(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, Progress progress); Result Delete(const std::string &path, const std::string &body, const std::string &content_type); + Result 
Delete(const std::string &path, const std::string &body, + const std::string &content_type, Progress progress); Result Delete(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); + Result Delete(const std::string &path, const Headers &headers, + const std::string &body, const std::string &content_type, + Progress progress); Result Options(const std::string &path); Result Options(const std::string &path, const Headers &headers); @@ -1258,6 +1405,7 @@ public: void set_address_family(int family); void set_tcp_nodelay(bool on); + void set_ipv6_v6only(bool on); void set_socket_options(SocketOptions socket_options); void set_connection_timeout(time_t sec, time_t usec = 0); @@ -1309,6 +1457,8 @@ public: #ifdef CPPHTTPLIB_OPENSSL_SUPPORT void enable_server_certificate_verification(bool enabled); + void enable_server_hostname_verification(bool enabled); + void set_server_certificate_verifier(std::function verifier); #endif void set_logger(Logger logger); @@ -1375,10 +1525,10 @@ protected: time_t connection_timeout_sec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND; time_t connection_timeout_usec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND; - time_t read_timeout_sec_ = CPPHTTPLIB_READ_TIMEOUT_SECOND; - time_t read_timeout_usec_ = CPPHTTPLIB_READ_TIMEOUT_USECOND; - time_t write_timeout_sec_ = CPPHTTPLIB_WRITE_TIMEOUT_SECOND; - time_t write_timeout_usec_ = CPPHTTPLIB_WRITE_TIMEOUT_USECOND; + time_t read_timeout_sec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND; + time_t read_timeout_usec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND; + time_t write_timeout_sec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND; + time_t write_timeout_usec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND; std::string basic_auth_username_; std::string basic_auth_password_; @@ -1395,6 +1545,7 @@ protected: int address_family_ = AF_UNSPEC; bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY; + bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY; SocketOptions socket_options_ = nullptr; bool 
compress_ = false; @@ -1422,6 +1573,8 @@ protected: #ifdef CPPHTTPLIB_OPENSSL_SUPPORT bool server_certificate_verification_ = true; + bool server_hostname_verification_ = true; + std::function server_certificate_verifier_; #endif Logger logger_; @@ -1430,6 +1583,9 @@ private: bool send_(Request &req, Response &res, Error &error); Result send_(Request &&req); +#ifdef CPPHTTPLIB_OPENSSL_SUPPORT + bool is_ssl_peer_could_be_closed(SSL *ssl) const; +#endif socket_t create_client_socket(Error &error) const; bool read_response_line(Stream &strm, const Request &req, Response &res) const; @@ -1448,7 +1604,7 @@ private: const Headers &headers, const char *body, size_t content_length, ContentProvider content_provider, ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type); + const std::string &content_type, Progress progress); ContentProviderWithoutLength get_multipart_content_provider( const std::string &boundary, const MultipartFormDataItems &items, const MultipartFormDataProviderItems &provider_items) const; @@ -1477,6 +1633,7 @@ public: const std::string &client_key_path); Client(Client &&) = default; + Client &operator=(Client &&) = default; ~Client(); @@ -1523,10 +1680,18 @@ public: const std::string &content_type); Result Post(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); + Result Post(const std::string &path, const Headers &headers, const char *body, + size_t content_length, const std::string &content_type, + Progress progress); Result Post(const std::string &path, const std::string &body, const std::string &content_type); + Result Post(const std::string &path, const std::string &body, + const std::string &content_type, Progress progress); Result Post(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); + Result Post(const std::string &path, const Headers &headers, + const std::string &body, 
const std::string &content_type, + Progress progress); Result Post(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); @@ -1542,6 +1707,8 @@ public: Result Post(const std::string &path, const Params &params); Result Post(const std::string &path, const Headers &headers, const Params &params); + Result Post(const std::string &path, const Headers &headers, + const Params &params, Progress progress); Result Post(const std::string &path, const MultipartFormDataItems &items); Result Post(const std::string &path, const Headers &headers, const MultipartFormDataItems &items); @@ -1556,10 +1723,18 @@ public: const std::string &content_type); Result Put(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); + Result Put(const std::string &path, const Headers &headers, const char *body, + size_t content_length, const std::string &content_type, + Progress progress); Result Put(const std::string &path, const std::string &body, const std::string &content_type); + Result Put(const std::string &path, const std::string &body, + const std::string &content_type, Progress progress); Result Put(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); + Result Put(const std::string &path, const Headers &headers, + const std::string &body, const std::string &content_type, + Progress progress); Result Put(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); Result Put(const std::string &path, @@ -1574,6 +1749,8 @@ public: Result Put(const std::string &path, const Params &params); Result Put(const std::string &path, const Headers &headers, const Params &params); + Result Put(const std::string &path, const Headers &headers, + const Params &params, Progress progress); Result Put(const std::string &path, const MultipartFormDataItems &items); Result Put(const std::string 
&path, const Headers &headers, const MultipartFormDataItems &items); @@ -1586,13 +1763,23 @@ public: Result Patch(const std::string &path); Result Patch(const std::string &path, const char *body, size_t content_length, const std::string &content_type); + Result Patch(const std::string &path, const char *body, size_t content_length, + const std::string &content_type, Progress progress); Result Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); + Result Patch(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, Progress progress); Result Patch(const std::string &path, const std::string &body, const std::string &content_type); + Result Patch(const std::string &path, const std::string &body, + const std::string &content_type, Progress progress); Result Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); + Result Patch(const std::string &path, const Headers &headers, + const std::string &body, const std::string &content_type, + Progress progress); Result Patch(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type); @@ -1610,13 +1797,24 @@ public: Result Delete(const std::string &path, const Headers &headers); Result Delete(const std::string &path, const char *body, size_t content_length, const std::string &content_type); + Result Delete(const std::string &path, const char *body, + size_t content_length, const std::string &content_type, + Progress progress); Result Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type); + Result Delete(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, Progress progress); Result Delete(const std::string &path, const 
std::string &body, const std::string &content_type); + Result Delete(const std::string &path, const std::string &body, + const std::string &content_type, Progress progress); Result Delete(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type); + Result Delete(const std::string &path, const Headers &headers, + const std::string &body, const std::string &content_type, + Progress progress); Result Options(const std::string &path); Result Options(const std::string &path, const Headers &headers); @@ -1685,6 +1883,8 @@ public: #ifdef CPPHTTPLIB_OPENSSL_SUPPORT void enable_server_certificate_verification(bool enabled); + void enable_server_hostname_verification(bool enabled); + void set_server_certificate_verifier(std::function verifier); #endif void set_logger(Logger logger); @@ -1730,6 +1930,9 @@ public: SSL_CTX *ssl_context() const; + void update_certs(X509 *cert, EVP_PKEY *private_key, + X509_STORE *client_ca_cert_store = nullptr); + private: bool process_and_close_socket(socket_t sock) override; @@ -1810,68 +2013,58 @@ inline void duration_to_sec_and_usec(const T &duration, U callback) { callback(static_cast(sec), static_cast(usec)); } +inline bool is_numeric(const std::string &str) { + return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit); +} + inline uint64_t get_header_value_u64(const Headers &headers, - const std::string &key, size_t id, - uint64_t def) { + const std::string &key, uint64_t def, + size_t id, bool &is_invalid_value) { + is_invalid_value = false; auto rng = headers.equal_range(key); auto it = rng.first; std::advance(it, static_cast(id)); if (it != rng.second) { - return std::strtoull(it->second.data(), nullptr, 10); + if (is_numeric(it->second)) { + return std::strtoull(it->second.data(), nullptr, 10); + } else { + is_invalid_value = true; + } } return def; } +inline uint64_t get_header_value_u64(const Headers &headers, + const std::string &key, uint64_t def, + size_t id) { + bool 
dummy = false; + return get_header_value_u64(headers, key, def, id, dummy); +} + } // namespace detail inline uint64_t Request::get_header_value_u64(const std::string &key, - size_t id) const { - return detail::get_header_value_u64(headers, key, id, 0); + uint64_t def, size_t id) const { + return detail::get_header_value_u64(headers, key, def, id); } inline uint64_t Response::get_header_value_u64(const std::string &key, - size_t id) const { - return detail::get_header_value_u64(headers, key, id, 0); -} - -template -inline ssize_t Stream::write_format(const char *fmt, const Args &...args) { - const auto bufsiz = 2048; - std::array buf{}; - - auto sn = snprintf(buf.data(), buf.size() - 1, fmt, args...); - if (sn <= 0) { return sn; } - - auto n = static_cast(sn); - - if (n >= buf.size() - 1) { - std::vector glowable_buf(buf.size()); - - while (n >= glowable_buf.size() - 1) { - glowable_buf.resize(glowable_buf.size() * 2); - n = static_cast( - snprintf(&glowable_buf[0], glowable_buf.size() - 1, fmt, args...)); - } - return write(&glowable_buf[0], n); - } else { - return write(buf.data(), n); - } + uint64_t def, size_t id) const { + return detail::get_header_value_u64(headers, key, def, id); } inline void default_socket_options(socket_t sock) { - int yes = 1; + int opt = 1; #ifdef _WIN32 setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - reinterpret_cast(&yes), sizeof(yes)); - setsockopt(sock, SOL_SOCKET, SO_EXCLUSIVEADDRUSE, - reinterpret_cast(&yes), sizeof(yes)); + reinterpret_cast(&opt), sizeof(opt)); #else #ifdef SO_REUSEPORT setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, - reinterpret_cast(&yes), sizeof(yes)); + reinterpret_cast(&opt), sizeof(opt)); #else setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - reinterpret_cast(&yes), sizeof(yes)); + reinterpret_cast(&opt), sizeof(opt)); #endif #endif } @@ -1997,6 +2190,8 @@ inline std::string to_string(const Error error) { case Error::SSLConnection: return "SSL connection failed"; case Error::SSLLoadingCerts: return "SSL certificate 
loading failed"; case Error::SSLServerVerification: return "SSL server verification failed"; + case Error::SSLServerHostnameVerification: + return "SSL server hostname verification failed"; case Error::UnsupportedMultipartBoundaryChars: return "Unsupported HTTP multipart boundary characters"; case Error::Compression: return "Compression failed"; @@ -2016,8 +2211,9 @@ inline std::ostream &operator<<(std::ostream &os, const Error &obj) { } inline uint64_t Result::get_request_header_value_u64(const std::string &key, + uint64_t def, size_t id) const { - return detail::get_header_value_u64(request_headers_, key, id, 0); + return detail::get_header_value_u64(request_headers_, key, def, id); } template @@ -2080,6 +2276,36 @@ make_basic_authentication_header(const std::string &username, namespace detail { +#if defined(_WIN32) +inline std::wstring u8string_to_wstring(const char *s) { + std::wstring ws; + auto len = static_cast(strlen(s)); + auto wlen = ::MultiByteToWideChar(CP_UTF8, 0, s, len, nullptr, 0); + if (wlen > 0) { + ws.resize(wlen); + wlen = ::MultiByteToWideChar( + CP_UTF8, 0, s, len, + const_cast(reinterpret_cast(ws.data())), wlen); + if (wlen != static_cast(ws.size())) { ws.clear(); } + } + return ws; +} +#endif + +struct FileStat { + FileStat(const std::string &path); + bool is_file() const; + bool is_dir() const; + +private: +#if defined(_WIN32) + struct _stat st_; +#else + struct stat st_; +#endif + int ret_ = -1; +}; + std::string encode_query_param(const std::string &value); std::string decode_url(const std::string &s, bool convert_plus_to_space); @@ -2088,6 +2314,16 @@ void read_file(const std::string &path, std::string &out); std::string trim_copy(const std::string &s); +void divide( + const char *data, std::size_t size, char d, + std::function + fn); + +void divide( + const std::string &str, char d, + std::function + fn); + void split(const char *b, const char *e, char d, std::function fn); @@ -2099,18 +2335,23 @@ bool process_client_socket(socket_t 
sock, time_t read_timeout_sec, time_t write_timeout_usec, std::function callback); -socket_t create_client_socket( - const std::string &host, const std::string &ip, int port, - int address_family, bool tcp_nodelay, SocketOptions socket_options, - time_t connection_timeout_sec, time_t connection_timeout_usec, - time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, - time_t write_timeout_usec, const std::string &intf, Error &error); +socket_t create_client_socket(const std::string &host, const std::string &ip, + int port, int address_family, bool tcp_nodelay, + bool ipv6_v6only, SocketOptions socket_options, + time_t connection_timeout_sec, + time_t connection_timeout_usec, + time_t read_timeout_sec, time_t read_timeout_usec, + time_t write_timeout_sec, + time_t write_timeout_usec, + const std::string &intf, Error &error); const char *get_header_value(const Headers &headers, const std::string &key, - size_t id = 0, const char *def = nullptr); + const char *def, size_t id); std::string params_to_query_str(const Params &params); +void parse_query_text(const char *data, std::size_t size, Params &params); + void parse_query_text(const std::string &s, Params &params); bool parse_multipart_boundary(const std::string &content_type, @@ -2270,15 +2511,70 @@ public: private: #if defined(_WIN32) - HANDLE hFile_; - HANDLE hMapping_; + HANDLE hFile_ = NULL; + HANDLE hMapping_ = NULL; #else - int fd_; + int fd_ = -1; #endif - size_t size_; - void *addr_; + size_t size_ = 0; + void *addr_ = nullptr; + bool is_open_empty_file = false; }; +// NOTE: https://www.rfc-editor.org/rfc/rfc9110#section-5 +namespace fields { + +inline bool is_token_char(char c) { + return std::isalnum(c) || c == '!' || c == '#' || c == '$' || c == '%' || + c == '&' || c == '\'' || c == '*' || c == '+' || c == '-' || + c == '.' 
|| c == '^' || c == '_' || c == '`' || c == '|' || c == '~'; +} + +inline bool is_token(const std::string &s) { + if (s.empty()) { return false; } + for (auto c : s) { + if (!is_token_char(c)) { return false; } + } + return true; +} + +inline bool is_field_name(const std::string &s) { return is_token(s); } + +inline bool is_vchar(char c) { return c >= 33 && c <= 126; } + +inline bool is_obs_text(char c) { return 128 <= static_cast(c); } + +inline bool is_field_vchar(char c) { return is_vchar(c) || is_obs_text(c); } + +inline bool is_field_content(const std::string &s) { + if (s.empty()) { return false; } + + if (s.size() == 1) { + return is_field_vchar(s[0]); + } else if (s.size() == 2) { + return is_field_vchar(s[0]) && is_field_vchar(s[1]); + } else { + size_t i = 0; + + if (!is_field_vchar(s[i])) { return false; } + i++; + + while (i < s.size() - 1) { + auto c = s[i++]; + if (c == ' ' || c == '\t' || is_field_vchar(c)) { + } else { + return false; + } + } + + return is_field_vchar(s[i]); + } +} + +inline bool is_field_value(const std::string &s) { return is_field_content(s); } + +} // namespace fields + } // namespace detail // ---------------------------------------------------------------------------- @@ -2392,20 +2688,6 @@ inline std::string base64_encode(const std::string &in) { return out; } -inline bool is_file(const std::string &path) { -#ifdef _WIN32 - return _access_s(path.c_str(), 0) == 0; -#else - struct stat st; - return stat(path.c_str(), &st) >= 0 && S_ISREG(st.st_mode); -#endif -} - -inline bool is_dir(const std::string &path) { - struct stat st; - return stat(path.c_str(), &st) >= 0 && S_ISDIR(st.st_mode); -} - inline bool is_valid_path(const std::string &path) { size_t level = 0; size_t i = 0; @@ -2448,6 +2730,21 @@ inline bool is_valid_path(const std::string &path) { return true; } +inline FileStat::FileStat(const std::string &path) { +#if defined(_WIN32) + auto wpath = u8string_to_wstring(path.c_str()); + ret_ = _wstat(wpath.c_str(), &st_); 
+#else + ret_ = stat(path.c_str(), &st_); +#endif +} +inline bool FileStat::is_file() const { + return ret_ >= 0 && S_ISREG(st_.st_mode); +} +inline bool FileStat::is_dir() const { + return ret_ >= 0 && S_ISDIR(st_.st_mode); +} + inline std::string encode_query_param(const std::string &value) { std::ostringstream escaped; escaped.fill('0'); @@ -2579,6 +2876,27 @@ inline std::string trim_double_quotes_copy(const std::string &s) { return s; } +inline void +divide(const char *data, std::size_t size, char d, + std::function + fn) { + const auto it = std::find(data, data + size, d); + const auto found = static_cast(it != data + size); + const auto lhs_data = data; + const auto lhs_size = static_cast(it - data); + const auto rhs_data = it + found; + const auto rhs_size = size - lhs_size - found; + + fn(lhs_data, lhs_size, rhs_data, rhs_size); +} + +inline void +divide(const std::string &str, char d, + std::function + fn) { + divide(str.data(), str.size(), d, std::move(fn)); +} + inline void split(const char *b, const char *e, char d, std::function fn) { return split(b, e, d, (std::numeric_limits::max)(), std::move(fn)); @@ -2636,6 +2954,10 @@ inline bool stream_line_reader::getline() { fixed_buffer_used_size_ = 0; glowable_buffer_.clear(); +#ifndef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR + char prev_byte = 0; +#endif + for (size_t i = 0;; i++) { char byte; auto n = strm_.read(&byte, 1); @@ -2652,7 +2974,12 @@ inline bool stream_line_reader::getline() { append(byte); +#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR if (byte == '\n') { break; } +#else + if (prev_byte == '\r' && byte == '\n') { break; } + prev_byte = byte; +#endif } return true; @@ -2671,16 +2998,7 @@ inline void stream_line_reader::append(char c) { } } -inline mmap::mmap(const char *path) -#if defined(_WIN32) - : hFile_(NULL), hMapping_(NULL) -#else - : fd_(-1) -#endif - , - size_(0), addr_(nullptr) { - open(path); -} +inline mmap::mmap(const char *path) { open(path); } inline mmap::~mmap() { close(); } @@ 
-2688,29 +3006,60 @@ inline bool mmap::open(const char *path) { close(); #if defined(_WIN32) - std::wstring wpath; - for (size_t i = 0; i < strlen(path); i++) { - wpath += path[i]; - } + auto wpath = u8string_to_wstring(path); + if (wpath.empty()) { return false; } +#if _WIN32_WINNT >= _WIN32_WINNT_WIN8 hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, NULL); +#else + hFile_ = ::CreateFileW(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); +#endif if (hFile_ == INVALID_HANDLE_VALUE) { return false; } LARGE_INTEGER size{}; if (!::GetFileSizeEx(hFile_, &size)) { return false; } + // If the following line doesn't compile due to QuadPart, update Windows SDK. + // See: + // https://github.com/yhirose/cpp-httplib/issues/1903#issuecomment-2316520721 + if (static_cast(size.QuadPart) > + (std::numeric_limits::max)()) { + // `size_t` might be 32-bits, on 32-bits Windows. + return false; + } size_ = static_cast(size.QuadPart); +#if _WIN32_WINNT >= _WIN32_WINNT_WIN8 hMapping_ = ::CreateFileMappingFromApp(hFile_, NULL, PAGE_READONLY, size_, NULL); +#else + hMapping_ = ::CreateFileMappingW(hFile_, NULL, PAGE_READONLY, 0, 0, NULL); +#endif + + // Special treatment for an empty file... + if (hMapping_ == NULL && size_ == 0) { + close(); + is_open_empty_file = true; + return true; + } if (hMapping_ == NULL) { close(); return false; } +#if _WIN32_WINNT >= _WIN32_WINNT_WIN8 addr_ = ::MapViewOfFileFromApp(hMapping_, FILE_MAP_READ, 0, 0); +#else + addr_ = ::MapViewOfFile(hMapping_, FILE_MAP_READ, 0, 0, 0); +#endif + + if (addr_ == nullptr) { + close(); + return false; + } #else fd_ = ::open(path, O_RDONLY); if (fd_ == -1) { return false; } @@ -2723,22 +3072,26 @@ inline bool mmap::open(const char *path) { size_ = static_cast(sb.st_size); addr_ = ::mmap(NULL, size_, PROT_READ, MAP_PRIVATE, fd_, 0); -#endif - if (addr_ == nullptr) { + // Special treatment for an empty file... 
+ if (addr_ == MAP_FAILED && size_ == 0) { close(); + is_open_empty_file = true; return false; } +#endif return true; } -inline bool mmap::is_open() const { return addr_ != nullptr; } +inline bool mmap::is_open() const { + return is_open_empty_file ? true : addr_ != nullptr; +} inline size_t mmap::size() const { return size_; } inline const char *mmap::data() const { - return static_cast(addr_); + return is_open_empty_file ? "" : static_cast(addr_); } inline void mmap::close() { @@ -2757,6 +3110,8 @@ inline void mmap::close() { ::CloseHandle(hFile_); hFile_ = INVALID_HANDLE_VALUE; } + + is_open_empty_file = false; #else if (addr_ != nullptr) { munmap(addr_, size_); @@ -2782,7 +3137,10 @@ template inline ssize_t handle_EINTR(T fn) { ssize_t res = 0; while (true) { res = fn(); - if (res < 0 && errno == EINTR) { continue; } + if (res < 0 && errno == EINTR) { + std::this_thread::sleep_for(std::chrono::microseconds{1}); + continue; + } break; } return res; @@ -2991,23 +3349,37 @@ private: }; #endif -inline bool keep_alive(socket_t sock, time_t keep_alive_timeout_sec) { +inline bool keep_alive(const std::atomic &svr_sock, socket_t sock, + time_t keep_alive_timeout_sec) { using namespace std::chrono; - auto start = steady_clock::now(); + + const auto interval_usec = + CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND; + + // Avoid expensive `steady_clock::now()` call for the first time + if (select_read(sock, 0, interval_usec) > 0) { return true; } + + const auto start = steady_clock::now() - microseconds{interval_usec}; + const auto timeout = seconds{keep_alive_timeout_sec}; + while (true) { - auto val = select_read(sock, 0, 10000); + if (svr_sock == INVALID_SOCKET) { + break; // Server socket is closed + } + + auto val = select_read(sock, 0, interval_usec); if (val < 0) { - return false; + break; // Socket error } else if (val == 0) { - auto current = steady_clock::now(); - auto duration = duration_cast(current - start); - auto timeout = keep_alive_timeout_sec * 
1000; - if (duration.count() > timeout) { return false; } - std::this_thread::sleep_for(std::chrono::milliseconds(1)); + if (steady_clock::now() - start > timeout) { + break; // Timeout + } } else { - return true; + return true; // Ready for read } } + + return false; } template @@ -3018,8 +3390,7 @@ process_server_socket_core(const std::atomic &svr_sock, socket_t sock, assert(keep_alive_max_count > 0); auto ret = false; auto count = keep_alive_max_count; - while (svr_sock != INVALID_SOCKET && count > 0 && - keep_alive(sock, keep_alive_timeout_sec)) { + while (count > 0 && keep_alive(svr_sock, sock, keep_alive_timeout_sec)) { auto close_connection = count == 1; auto connection_closed = false; ret = callback(close_connection, connection_closed); @@ -3063,10 +3434,29 @@ inline int shutdown_socket(socket_t sock) { #endif } +inline std::string escape_abstract_namespace_unix_domain(const std::string &s) { + if (s.size() > 1 && s[0] == '\0') { + auto ret = s; + ret[0] = '@'; + return ret; + } + return s; +} + +inline std::string +unescape_abstract_namespace_unix_domain(const std::string &s) { + if (s.size() > 1 && s[0] == '@') { + auto ret = s; + ret[0] = '\0'; + return ret; + } + return s; +} + template socket_t create_socket(const std::string &host, const std::string &ip, int port, int address_family, int socket_flags, bool tcp_nodelay, - SocketOptions socket_options, + bool ipv6_v6only, SocketOptions socket_options, BindOrConnect bind_or_connect) { // Get address info const char *node = nullptr; @@ -3075,7 +3465,7 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, memset(&hints, 0, sizeof(struct addrinfo)); hints.ai_socktype = SOCK_STREAM; - hints.ai_protocol = 0; + hints.ai_protocol = IPPROTO_IP; if (!ip.empty()) { node = ip.c_str(); @@ -3093,20 +3483,32 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, const auto addrlen = host.length(); if (addrlen > sizeof(sockaddr_un::sun_path)) { return 
INVALID_SOCKET; } +#ifdef SOCK_CLOEXEC + auto sock = socket(hints.ai_family, hints.ai_socktype | SOCK_CLOEXEC, + hints.ai_protocol); +#else auto sock = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol); +#endif + if (sock != INVALID_SOCKET) { sockaddr_un addr{}; addr.sun_family = AF_UNIX; - std::copy(host.begin(), host.end(), addr.sun_path); + + auto unescaped_host = unescape_abstract_namespace_unix_domain(host); + std::copy(unescaped_host.begin(), unescaped_host.end(), addr.sun_path); hints.ai_addr = reinterpret_cast(&addr); hints.ai_addrlen = static_cast( sizeof(addr) - sizeof(addr.sun_path) + addrlen); +#ifndef SOCK_CLOEXEC fcntl(sock, F_SETFD, FD_CLOEXEC); +#endif + if (socket_options) { socket_options(sock); } - if (!bind_or_connect(sock, hints)) { + bool dummy; + if (!bind_or_connect(sock, hints, dummy)) { close_socket(sock); sock = INVALID_SOCKET; } @@ -3123,6 +3525,7 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, #endif return INVALID_SOCKET; } + auto se = detail::scope_exit([&] { freeaddrinfo(result); }); for (auto rp = result; rp; rp = rp->ai_next) { // Create a socket @@ -3148,11 +3551,18 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); } #else + +#ifdef SOCK_CLOEXEC + auto sock = + socket(rp->ai_family, rp->ai_socktype | SOCK_CLOEXEC, rp->ai_protocol); +#else auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); +#endif + #endif if (sock == INVALID_SOCKET) { continue; } -#ifndef _WIN32 +#if !defined _WIN32 && !defined SOCK_CLOEXEC if (fcntl(sock, F_SETFD, FD_CLOEXEC) == -1) { close_socket(sock); continue; @@ -3160,39 +3570,38 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, #endif if (tcp_nodelay) { - auto yes = 1; + auto opt = 1; #ifdef _WIN32 setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - reinterpret_cast(&yes), sizeof(yes)); + reinterpret_cast(&opt), 
sizeof(opt)); #else setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - reinterpret_cast(&yes), sizeof(yes)); + reinterpret_cast(&opt), sizeof(opt)); +#endif + } + + if (rp->ai_family == AF_INET6) { + auto opt = ipv6_v6only ? 1 : 0; +#ifdef _WIN32 + setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, + reinterpret_cast(&opt), sizeof(opt)); +#else + setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, + reinterpret_cast(&opt), sizeof(opt)); #endif } if (socket_options) { socket_options(sock); } - if (rp->ai_family == AF_INET6) { - auto no = 0; -#ifdef _WIN32 - setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - reinterpret_cast(&no), sizeof(no)); -#else - setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - reinterpret_cast(&no), sizeof(no)); -#endif - } - // bind or connect - if (bind_or_connect(sock, *rp)) { - freeaddrinfo(result); - return sock; - } + auto quit = false; + if (bind_or_connect(sock, *rp, quit)) { return sock; } close_socket(sock); + + if (quit) { break; } } - freeaddrinfo(result); return INVALID_SOCKET; } @@ -3225,6 +3634,7 @@ inline bool bind_ip_address(socket_t sock, const std::string &host) { hints.ai_protocol = 0; if (getaddrinfo(host.c_str(), "0", &hints, &result)) { return false; } + auto se = detail::scope_exit([&] { freeaddrinfo(result); }); auto ret = false; for (auto rp = result; rp; rp = rp->ai_next) { @@ -3235,7 +3645,6 @@ inline bool bind_ip_address(socket_t sock, const std::string &host) { } } - freeaddrinfo(result); return ret; } @@ -3247,6 +3656,8 @@ inline bool bind_ip_address(socket_t sock, const std::string &host) { inline std::string if2ip(int address_family, const std::string &ifn) { struct ifaddrs *ifap; getifaddrs(&ifap); + auto se = detail::scope_exit([&] { freeifaddrs(ifap); }); + std::string addr_candidate; for (auto ifa = ifap; ifa; ifa = ifa->ifa_next) { if (ifa->ifa_addr && ifn == ifa->ifa_name && @@ -3256,7 +3667,6 @@ inline std::string if2ip(int address_family, const std::string &ifn) { auto sa = reinterpret_cast(ifa->ifa_addr); char 
buf[INET_ADDRSTRLEN]; if (inet_ntop(AF_INET, &sa->sin_addr, buf, INET_ADDRSTRLEN)) { - freeifaddrs(ifap); return std::string(buf, INET_ADDRSTRLEN); } } else if (ifa->ifa_addr->sa_family == AF_INET6) { @@ -3269,7 +3679,6 @@ inline std::string if2ip(int address_family, const std::string &ifn) { if (s6_addr_head == 0xfc || s6_addr_head == 0xfd) { addr_candidate = std::string(buf, INET6_ADDRSTRLEN); } else { - freeifaddrs(ifap); return std::string(buf, INET6_ADDRSTRLEN); } } @@ -3277,20 +3686,21 @@ inline std::string if2ip(int address_family, const std::string &ifn) { } } } - freeifaddrs(ifap); return addr_candidate; } #endif inline socket_t create_client_socket( const std::string &host, const std::string &ip, int port, - int address_family, bool tcp_nodelay, SocketOptions socket_options, - time_t connection_timeout_sec, time_t connection_timeout_usec, - time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, + int address_family, bool tcp_nodelay, bool ipv6_v6only, + SocketOptions socket_options, time_t connection_timeout_sec, + time_t connection_timeout_usec, time_t read_timeout_sec, + time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, const std::string &intf, Error &error) { auto sock = create_socket( - host, ip, port, address_family, 0, tcp_nodelay, std::move(socket_options), - [&](socket_t sock2, struct addrinfo &ai) -> bool { + host, ip, port, address_family, 0, tcp_nodelay, ipv6_v6only, + std::move(socket_options), + [&](socket_t sock2, struct addrinfo &ai, bool &quit) -> bool { if (!intf.empty()) { #ifdef USE_IF2IP auto ip_from_if = if2ip(address_family, intf); @@ -3314,7 +3724,10 @@ inline socket_t create_client_socket( } error = wait_until_socket_is_ready(sock2, connection_timeout_sec, connection_timeout_usec); - if (error != Error::Success) { return false; } + if (error != Error::Success) { + if (error == Error::ConnectionTimeout) { quit = true; } + return false; + } } set_nonblocking(sock2, false); @@ -3439,7 
+3852,7 @@ inline unsigned int str2tag(const std::string &s) { namespace udl { -inline constexpr unsigned int operator"" _t(const char *s, size_t l) { +inline constexpr unsigned int operator""_t(const char *s, size_t l) { return str2tag_core(s, l, 0); } @@ -3524,8 +3937,9 @@ inline bool can_compress_content_type(const std::string &content_type) { case "application/protobuf"_t: case "application/xhtml+xml"_t: return true; - default: - return !content_type.rfind("text/", 0) && tag != "text/event-stream"_t; + case "text/event-stream"_t: return false; + + default: return !content_type.rfind("text/", 0); } } @@ -3762,8 +4176,8 @@ inline bool has_header(const Headers &headers, const std::string &key) { } inline const char *get_header_value(const Headers &headers, - const std::string &key, size_t id, - const char *def) { + const std::string &key, const char *def, + size_t id) { auto rng = headers.equal_range(key); auto it = rng.first; std::advance(it, static_cast(id)); @@ -3771,14 +4185,6 @@ inline const char *get_header_value(const Headers &headers, return def; } -inline bool compare_case_ignore(const std::string &a, const std::string &b) { - if (a.size() != b.size()) { return false; } - for (size_t i = 0; i < b.size(); i++) { - if (::tolower(a[i]) != ::tolower(b[i])) { return false; } - } - return true; -} - template inline bool parse_header(const char *beg, const char *end, T fn) { // Skip trailing spaces and tabs. @@ -3801,15 +4207,27 @@ inline bool parse_header(const char *beg, const char *end, T fn) { p++; } - if (p < end) { + if (p <= end) { auto key_len = key_end - beg; if (!key_len) { return false; } auto key = std::string(beg, key_end); - auto val = compare_case_ignore(key, "Location") + auto val = case_ignore::equal(key, "Location") ? 
std::string(p, end) : decode_url(std::string(p, end), false); - fn(std::move(key), std::move(val)); + + // NOTE: From RFC 9110: + // Field values containing CR, LF, or NUL characters are + // invalid and dangerous, due to the varying ways that + // implementations might parse and interpret those + // characters; a recipient of CR, LF, or NUL within a field + // value MUST either reject the message or replace each of + // those characters with SP before further processing or + // forwarding of that message. + static const std::string CR_LF_NUL("\r\n\0", 3); + if (val.find_first_of(CR_LF_NUL) != std::string::npos) { return false; } + + fn(key, val); return true; } @@ -3829,27 +4247,27 @@ inline bool read_headers(Stream &strm, Headers &headers) { if (line_reader.end_with_crlf()) { // Blank line indicates end of headers. if (line_reader.size() == 2) { break; } -#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR } else { +#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR // Blank line indicates end of headers. if (line_reader.size() == 1) { break; } line_terminator_len = 1; - } #else - } else { continue; // Skip invalid line. 
- } #endif + } if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } // Exclude line terminator auto end = line_reader.ptr() + line_reader.size() - line_terminator_len; - parse_header(line_reader.ptr(), end, - [&](std::string &&key, std::string &&val) { - headers.emplace(std::move(key), std::move(val)); - }); + if (!parse_header(line_reader.ptr(), end, + [&](const std::string &key, std::string &val) { + headers.emplace(key, val); + })) { + return false; + } } return true; @@ -3937,8 +4355,19 @@ inline bool read_content_chunked(Stream &strm, T &x, assert(chunk_len == 0); - // Trailer - if (!line_reader.getline()) { return false; } + // NOTE: In RFC 9112, '7.1 Chunked Transfer Coding' mentions "The chunked + // transfer coding is complete when a chunk with a chunk-size of zero is + // received, possibly followed by a trailer section, and finally terminated by + // an empty line". https://www.rfc-editor.org/rfc/rfc9112.html#section-7.1 + // + // In '7.1.3. Decoding Chunked', however, the pseudo-code in the section + // doesn't care for the existence of the final CRLF. In other words, it seems + // to be ok whether the final CRLF exists or not in the chunked data. + // https://www.rfc-editor.org/rfc/rfc9112.html#section-7.1.3 + // + // According to the reference code in RFC 9112, cpp-httplib now allows + // chunked transfer coding data without the final CRLF. 
+ if (!line_reader.getline()) { return true; } while (strcmp(line_reader.ptr(), "\r\n") != 0) { if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } @@ -3948,8 +4377,8 @@ inline bool read_content_chunked(Stream &strm, T &x, auto end = line_reader.ptr() + line_reader.size() - line_terminator_len; parse_header(line_reader.ptr(), end, - [&](std::string &&key, std::string &&val) { - x.headers.emplace(std::move(key), std::move(val)); + [&](const std::string &key, const std::string &val) { + x.headers.emplace(key, val); }); if (!line_reader.getline()) { return false; } @@ -3959,8 +4388,8 @@ inline bool read_content_chunked(Stream &strm, T &x, } inline bool is_chunked_transfer_encoding(const Headers &headers) { - return compare_case_ignore( - get_header_value(headers, "Transfer-Encoding", 0, ""), "chunked"); + return case_ignore::equal( + get_header_value(headers, "Transfer-Encoding", "", 0), "chunked"); } template @@ -4026,8 +4455,14 @@ bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status, } else if (!has_header(x.headers, "Content-Length")) { ret = read_content_without_length(strm, out); } else { - auto len = get_header_value_u64(x.headers, "Content-Length", 0, 0); - if (len > payload_max_length) { + auto is_invalid_value = false; + auto len = get_header_value_u64(x.headers, "Content-Length", + std::numeric_limits::max(), + 0, is_invalid_value); + + if (is_invalid_value) { + ret = false; + } else if (len > payload_max_length) { exceed_payload_max_length = true; skip_content_with_length(strm, len); ret = false; @@ -4042,13 +4477,36 @@ bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status, } return ret; }); -} // namespace detail +} + +inline ssize_t write_request_line(Stream &strm, const std::string &method, + const std::string &path) { + std::string s = method; + s += " "; + s += path; + s += " HTTP/1.1\r\n"; + return strm.write(s.data(), s.size()); +} + +inline ssize_t write_response_line(Stream &strm, 
int status) { + std::string s = "HTTP/1.1 "; + s += std::to_string(status); + s += " "; + s += httplib::status_message(status); + s += "\r\n"; + return strm.write(s.data(), s.size()); +} inline ssize_t write_headers(Stream &strm, const Headers &headers) { ssize_t write_len = 0; for (const auto &x : headers) { - auto len = - strm.write_format("%s: %s\r\n", x.first.c_str(), x.second.c_str()); + std::string s; + s = x.first; + s += ": "; + s += x.second; + s += "\r\n"; + + auto len = strm.write(s.data(), s.size()); if (len < 0) { return len; } write_len += len; } @@ -4302,22 +4760,22 @@ inline std::string params_to_query_str(const Params ¶ms) { return query; } -inline void parse_query_text(const std::string &s, Params ¶ms) { +inline void parse_query_text(const char *data, std::size_t size, + Params ¶ms) { std::set cache; - split(s.data(), s.data() + s.size(), '&', [&](const char *b, const char *e) { + split(data, data + size, '&', [&](const char *b, const char *e) { std::string kv(b, e); if (cache.find(kv) != cache.end()) { return; } - cache.insert(kv); + cache.insert(std::move(kv)); std::string key; std::string val; - split(b, e, '=', [&](const char *b2, const char *e2) { - if (key.empty()) { - key.assign(b2, e2); - } else { - val.assign(b2, e2); - } - }); + divide(b, static_cast(e - b), '=', + [&](const char *lhs_data, std::size_t lhs_size, const char *rhs_data, + std::size_t rhs_size) { + key.assign(lhs_data, lhs_size); + val.assign(rhs_data, rhs_size); + }); if (!key.empty()) { params.emplace(decode_url(key, true), decode_url(val, true)); @@ -4325,6 +4783,10 @@ inline void parse_query_text(const std::string &s, Params ¶ms) { }); } +inline void parse_query_text(const std::string &s, Params ¶ms) { + parse_query_text(s.data(), s.size(), params); +} + inline bool parse_multipart_boundary(const std::string &content_type, std::string &boundary) { auto boundary_keyword = "boundary="; @@ -4365,35 +4827,44 @@ inline bool parse_range_header(const std::string &s, Ranges 
&ranges) { #else inline bool parse_range_header(const std::string &s, Ranges &ranges) try { #endif - static auto re_first_range = std::regex(R"(bytes=(\d*-\d*(?:,\s*\d*-\d*)*))"); - std::smatch m; - if (std::regex_match(s, m, re_first_range)) { - auto pos = static_cast(m.position(1)); - auto len = static_cast(m.length(1)); + auto is_valid = [](const std::string &str) { + return std::all_of(str.cbegin(), str.cend(), + [](unsigned char c) { return std::isdigit(c); }); + }; + + if (s.size() > 7 && s.compare(0, 6, "bytes=") == 0) { + const auto pos = static_cast(6); + const auto len = static_cast(s.size() - 6); auto all_valid_ranges = true; split(&s[pos], &s[pos + len], ',', [&](const char *b, const char *e) { if (!all_valid_ranges) { return; } - static auto re_another_range = std::regex(R"(\s*(\d*)-(\d*))"); - std::cmatch cm; - if (std::regex_match(b, e, cm, re_another_range)) { - ssize_t first = -1; - if (!cm.str(1).empty()) { - first = static_cast(std::stoll(cm.str(1))); - } - ssize_t last = -1; - if (!cm.str(2).empty()) { - last = static_cast(std::stoll(cm.str(2))); - } - - if (first != -1 && last != -1 && first > last) { - all_valid_ranges = false; - return; - } - ranges.emplace_back(std::make_pair(first, last)); + const auto it = std::find(b, e, '-'); + if (it == e) { + all_valid_ranges = false; + return; } + + const auto lhs = std::string(b, it); + const auto rhs = std::string(it + 1, e); + if (!is_valid(lhs) || !is_valid(rhs)) { + all_valid_ranges = false; + return; + } + + const auto first = + static_cast(lhs.empty() ? -1 : std::stoll(lhs)); + const auto last = + static_cast(rhs.empty() ? 
-1 : std::stoll(rhs)); + if ((first == -1 && last == -1) || + (first != -1 && last != -1 && first > last)) { + all_valid_ranges = false; + return; + } + + ranges.emplace_back(first, last); }); - return all_valid_ranges; + return all_valid_ranges && !ranges.empty(); } return false; #ifdef CPPHTTPLIB_NO_EXCEPTIONS @@ -4452,7 +4923,7 @@ public: const auto header = buf_head(pos); if (!parse_header(header.data(), header.data() + header.size(), - [&](std::string &&, std::string &&) {})) { + [&](const std::string &, const std::string &) {})) { is_valid_ = false; return false; } @@ -4562,7 +5033,9 @@ private: const std::string &b) const { if (a.size() < b.size()) { return false; } for (size_t i = 0; i < b.size(); i++) { - if (::tolower(a[i]) != ::tolower(b[i])) { return false; } + if (case_ignore::to_lower(a[i]) != case_ignore::to_lower(b[i])) { + return false; + } } return true; } @@ -4645,16 +5118,6 @@ private: size_t buf_epos_ = 0; }; -inline std::string to_lower(const char *beg, const char *end) { - std::string out; - auto it = beg; - while (it != end) { - out += static_cast(::tolower(*it)); - it++; - } - return out; -} - inline std::string random_string(size_t length) { static const char data[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; @@ -4768,7 +5231,18 @@ inline bool range_error(Request &req, Response &res) { last_pos = contant_len - 1; } - if (last_pos == -1) { last_pos = contant_len - 1; } + // NOTE: RFC-9110 '14.1.2. Byte Ranges': + // A client can limit the number of bytes requested without knowing the + // size of the selected representation. If the last-pos value is absent, + // or if the value is greater than or equal to the current length of the + // representation data, the byte range is interpreted as the remainder of + // the representation (i.e., the server replaces the value of last-pos + // with a value that is one less than the current length of the selected + // representation). 
+ // https://www.rfc-editor.org/rfc/rfc9110.html#section-14.1.2-6 + if (last_pos == -1 || last_pos >= contant_len) { + last_pos = contant_len - 1; + } // Range must be within content length if (!(0 <= first_pos && first_pos <= last_pos && @@ -4795,12 +5269,11 @@ inline bool range_error(Request &req, Response &res) { inline std::pair get_range_offset_and_length(Range r, size_t content_length) { - (void)(content_length); // patch to get rid of "unused parameter" on release build assert(r.first != -1 && r.second != -1); assert(0 <= r.first && r.first < static_cast(content_length)); assert(r.first <= r.second && r.second < static_cast(content_length)); - + (void)(content_length); return std::make_pair(r.first, static_cast(r.second - r.first) + 1); } @@ -5230,6 +5703,7 @@ inline void hosted_at(const std::string &hostname, #endif return; } + auto se = detail::scope_exit([&] { freeaddrinfo(result); }); for (auto rp = result; rp; rp = rp->ai_next) { const auto &addr = @@ -5241,8 +5715,6 @@ inline void hosted_at(const std::string &hostname, addrs.push_back(ip); } } - - freeaddrinfo(result); } inline std::string append_query_params(const std::string &path, @@ -5291,8 +5763,8 @@ inline bool Request::has_header(const std::string &key) const { } inline std::string Request::get_header_value(const std::string &key, - size_t id) const { - return detail::get_header_value(headers, key, id, ""); + const char *def, size_t id) const { + return detail::get_header_value(headers, key, def, id); } inline size_t Request::get_header_value_count(const std::string &key) const { @@ -5302,7 +5774,8 @@ inline size_t Request::get_header_value_count(const std::string &key) const { inline void Request::set_header(const std::string &key, const std::string &val) { - if (!detail::has_crlf(key) && !detail::has_crlf(val)) { + if (detail::fields::is_field_name(key) && + detail::fields::is_field_value(val)) { headers.emplace(key, val); } } @@ -5356,8 +5829,9 @@ inline bool Response::has_header(const 
std::string &key) const { } inline std::string Response::get_header_value(const std::string &key, + const char *def, size_t id) const { - return detail::get_header_value(headers, key, id, ""); + return detail::get_header_value(headers, key, def, id); } inline size_t Response::get_header_value_count(const std::string &key) const { @@ -5367,13 +5841,14 @@ inline size_t Response::get_header_value_count(const std::string &key) const { inline void Response::set_header(const std::string &key, const std::string &val) { - if (!detail::has_crlf(key) && !detail::has_crlf(val)) { + if (detail::fields::is_field_name(key) && + detail::fields::is_field_value(val)) { headers.emplace(key, val); } } inline void Response::set_redirect(const std::string &url, int stat) { - if (!detail::has_crlf(url)) { + if (detail::fields::is_field_value(url)) { set_header("Location", url); if (300 <= stat && stat < 400) { this->status = stat; @@ -5436,14 +5911,25 @@ inline void Response::set_chunked_content_provider( is_chunked_content_provider_ = true; } +inline void Response::set_file_content(const std::string &path, + const std::string &content_type) { + file_content_path_ = path; + file_content_content_type_ = content_type; +} + +inline void Response::set_file_content(const std::string &path) { + file_content_path_ = path; +} + // Result implementation inline bool Result::has_request_header(const std::string &key) const { return request_headers_.find(key) != request_headers_.end(); } inline std::string Result::get_request_header_value(const std::string &key, + const char *def, size_t id) const { - return detail::get_header_value(request_headers_, key, id, ""); + return detail::get_header_value(request_headers_, key, def, id); } inline size_t @@ -5584,6 +6070,8 @@ inline socket_t BufferStream::socket() const { return 0; } inline const std::string &BufferStream::get_buffer() const { return buffer; } inline PathParamsMatcher::PathParamsMatcher(const std::string &pattern) { + static constexpr char 
marker[] = "/:"; + // One past the last ending position of a path param substring std::size_t last_param_end = 0; @@ -5596,13 +6084,14 @@ inline PathParamsMatcher::PathParamsMatcher(const std::string &pattern) { #endif while (true) { - const auto marker_pos = pattern.find(marker, last_param_end); + const auto marker_pos = pattern.find( + marker, last_param_end == 0 ? last_param_end : last_param_end - 1); if (marker_pos == std::string::npos) { break; } static_fragments_.push_back( - pattern.substr(last_param_end, marker_pos - last_param_end)); + pattern.substr(last_param_end, marker_pos - last_param_end + 1)); - const auto param_name_start = marker_pos + 1; + const auto param_name_start = marker_pos + 2; auto sep_pos = pattern.find(separator, param_name_start); if (sep_pos == std::string::npos) { sep_pos = pattern.length(); } @@ -5664,7 +6153,7 @@ inline bool PathParamsMatcher::match(Request &request) const { request.path_params.emplace( param_name, request.path.substr(starting_pos, sep_pos - starting_pos)); - // Mark everythin up to '/' as matched + // Mark everything up to '/' as matched starting_pos = sep_pos + 1; } // Returns false if the path is longer than the pattern @@ -5763,7 +6252,8 @@ inline bool Server::set_base_dir(const std::string &dir, inline bool Server::set_mount_point(const std::string &mount_point, const std::string &dir, Headers headers) { - if (detail::is_dir(dir)) { + detail::FileStat stat(dir); + if (stat.is_dir()) { std::string mnt = !mount_point.empty() ? 
mount_point : "/"; if (!mnt.empty() && mnt[0] == '/') { base_dirs_.push_back({mnt, dir, std::move(headers)}); @@ -5800,12 +6290,14 @@ inline Server &Server::set_file_request_handler(Handler handler) { return *this; } -inline Server &Server::set_error_handler(HandlerWithResponse handler) { +inline Server &Server::set_error_handler_core(HandlerWithResponse handler, + std::true_type) { error_handler_ = std::move(handler); return *this; } -inline Server &Server::set_error_handler(Handler handler) { +inline Server &Server::set_error_handler_core(Handler handler, + std::false_type) { error_handler_ = [handler](const Request &req, Response &res) { handler(req, res); return HandlerResponse::Handled; @@ -5849,6 +6341,11 @@ inline Server &Server::set_tcp_nodelay(bool on) { return *this; } +inline Server &Server::set_ipv6_v6only(bool on) { + ipv6_v6only_ = on; + return *this; +} + inline Server &Server::set_socket_options(SocketOptions socket_options) { socket_options_ = std::move(socket_options); return *this; @@ -5900,27 +6397,27 @@ inline Server &Server::set_payload_max_length(size_t length) { inline bool Server::bind_to_port(const std::string &host, int port, int socket_flags) { - return bind_internal(host, port, socket_flags) >= 0; + auto ret = bind_internal(host, port, socket_flags); + if (ret == -1) { is_decommisioned = true; } + return ret >= 0; } inline int Server::bind_to_any_port(const std::string &host, int socket_flags) { - return bind_internal(host, 0, socket_flags); + auto ret = bind_internal(host, 0, socket_flags); + if (ret == -1) { is_decommisioned = true; } + return ret; } -inline bool Server::listen_after_bind() { - auto se = detail::scope_exit([&]() { done_ = true; }); - return listen_internal(); -} +inline bool Server::listen_after_bind() { return listen_internal(); } inline bool Server::listen(const std::string &host, int port, int socket_flags) { - auto se = detail::scope_exit([&]() { done_ = true; }); return bind_to_port(host, port, socket_flags) && 
listen_internal(); } inline bool Server::is_running() const { return is_running_; } inline void Server::wait_until_ready() const { - while (!is_running() && !done_) { + while (!is_running_ && !is_decommisioned) { std::this_thread::sleep_for(std::chrono::milliseconds{1}); } } @@ -5932,8 +6429,11 @@ inline void Server::stop() { detail::shutdown_socket(sock); detail::close_socket(sock); } + is_decommisioned = false; } +inline void Server::decommission() { is_decommisioned = true; } + inline bool Server::parse_request_line(const char *s, Request &req) const { auto len = strlen(s); if (len < 2 || s[len - 2] != '\r' || s[len - 1] != '\n') { return false; } @@ -5972,26 +6472,13 @@ inline bool Server::parse_request_line(const char *s, Request &req) const { } } - size_t count = 0; - - detail::split(req.target.data(), req.target.data() + req.target.size(), '?', - 2, [&](const char *b, const char *e) { - switch (count) { - case 0: - req.path = detail::decode_url(std::string(b, e), false); - break; - case 1: { - if (e - b > 0) { - detail::parse_query_text(std::string(b, e), req.params); - } - break; - } - default: break; - } - count++; - }); - - if (count > 2) { return false; } + detail::divide(req.target, '?', + [&](const char *lhs_data, std::size_t lhs_size, + const char *rhs_data, std::size_t rhs_size) { + req.path = detail::decode_url( + std::string(lhs_data, lhs_size), false); + detail::parse_query_text(rhs_data, rhs_size, req.params); + }); } return true; @@ -6030,23 +6517,24 @@ inline bool Server::write_response_core(Stream &strm, bool close_connection, if (close_connection || req.get_header_value("Connection") == "close") { res.set_header("Connection", "close"); } else { - std::stringstream ss; - ss << "timeout=" << keep_alive_timeout_sec_ - << ", max=" << keep_alive_max_count_; - res.set_header("Keep-Alive", ss.str()); + std::string s = "timeout="; + s += std::to_string(keep_alive_timeout_sec_); + s += ", max="; + s += std::to_string(keep_alive_max_count_); + 
res.set_header("Keep-Alive", s); } - if (!res.has_header("Content-Type") && - (!res.body.empty() || res.content_length_ > 0 || res.content_provider_)) { + if ((!res.body.empty() || res.content_length_ > 0 || res.content_provider_) && + !res.has_header("Content-Type")) { res.set_header("Content-Type", "text/plain"); } - if (!res.has_header("Content-Length") && res.body.empty() && - !res.content_length_ && !res.content_provider_) { + if (res.body.empty() && !res.content_length_ && !res.content_provider_ && + !res.has_header("Content-Length")) { res.set_header("Content-Length", "0"); } - if (!res.has_header("Accept-Ranges") && req.method == "HEAD") { + if (req.method == "HEAD" && !res.has_header("Accept-Ranges")) { res.set_header("Accept-Ranges", "bytes"); } @@ -6055,12 +6543,7 @@ inline bool Server::write_response_core(Stream &strm, bool close_connection, // Response line and headers { detail::BufferStream bstrm; - - if (!bstrm.write_format("HTTP/1.1 %d %s\r\n", res.status, - status_message(res.status))) { - return false; - } - + if (!detail::write_response_line(bstrm, res.status)) { return false; } if (!header_writer_(bstrm, res.headers)) { return false; } // Flush buffer @@ -6254,7 +6737,14 @@ inline bool Server::handle_file_request(const Request &req, Response &res, auto path = entry.base_dir + sub_path; if (path.back() == '/') { path += "index.html"; } - if (detail::is_file(path)) { + detail::FileStat stat(path); + + if (stat.is_dir()) { + res.set_redirect(sub_path + "/", StatusCode::MovedPermanently_301); + return true; + } + + if (stat.is_file()) { for (const auto &kv : entry.headers) { res.set_header(kv.first, kv.second); } @@ -6289,8 +6779,8 @@ Server::create_server_socket(const std::string &host, int port, SocketOptions socket_options) const { return detail::create_socket( host, std::string(), port, address_family_, socket_flags, tcp_nodelay_, - std::move(socket_options), - [](socket_t sock, struct addrinfo &ai) -> bool { + ipv6_v6only_, 
std::move(socket_options), + [](socket_t sock, struct addrinfo &ai, bool & /*quit*/) -> bool { if (::bind(sock, ai.ai_addr, static_cast(ai.ai_addrlen))) { return false; } @@ -6301,6 +6791,8 @@ Server::create_server_socket(const std::string &host, int port, inline int Server::bind_internal(const std::string &host, int port, int socket_flags) { + if (is_decommisioned) { return -1; } + if (!is_valid()) { return -1; } svr_sock_ = create_server_socket(host, port, socket_flags, socket_options_); @@ -6326,6 +6818,8 @@ inline int Server::bind_internal(const std::string &host, int port, } inline bool Server::listen_internal() { + if (is_decommisioned) { return false; } + auto ret = true; is_running_ = true; auto se = detail::scope_exit([&]() { is_running_ = false; }); @@ -6346,13 +6840,22 @@ inline bool Server::listen_internal() { #ifndef _WIN32 } #endif + +#if defined _WIN32 + // sockets conneced via WASAccept inherit flags NO_HANDLE_INHERIT, + // OVERLAPPED + socket_t sock = WSAAccept(svr_sock_, nullptr, nullptr, nullptr, 0); +#elif defined SOCK_CLOEXEC + socket_t sock = accept4(svr_sock_, nullptr, nullptr, SOCK_CLOEXEC); +#else socket_t sock = accept(svr_sock_, nullptr, nullptr); +#endif if (sock == INVALID_SOCKET) { if (errno == EMFILE) { // The per-process limit of open file descriptors has been reached. // Try to accept new connections after a short sleep. 
- std::this_thread::sleep_for(std::chrono::milliseconds(1)); + std::this_thread::sleep_for(std::chrono::microseconds{1}); continue; } else if (errno == EINTR || errno == EAGAIN) { continue; @@ -6406,6 +6909,7 @@ inline bool Server::listen_internal() { task_queue->shutdown(); } + is_decommisioned = !ret; return ret; } @@ -6503,7 +7007,7 @@ inline bool Server::dispatch_request(Request &req, Response &res, inline void Server::apply_ranges(const Request &req, Response &res, std::string &content_type, std::string &boundary) const { - if (req.ranges.size() > 1) { + if (req.ranges.size() > 1 && res.status == StatusCode::PartialContent_206) { auto it = res.headers.find("Content-Type"); if (it != res.headers.end()) { content_type = it->second; @@ -6521,7 +7025,7 @@ inline void Server::apply_ranges(const Request &req, Response &res, if (res.body.empty()) { if (res.content_length_ > 0) { size_t length = 0; - if (req.ranges.empty()) { + if (req.ranges.empty() || res.status != StatusCode::PartialContent_206) { length = res.content_length_; } else if (req.ranges.size() == 1) { auto offset_and_length = detail::get_range_offset_and_length( @@ -6550,7 +7054,7 @@ inline void Server::apply_ranges(const Request &req, Response &res, } } } else { - if (req.ranges.empty()) { + if (req.ranges.empty() || res.status != StatusCode::PartialContent_206) { ; } else if (req.ranges.size() == 1) { auto offset_and_length = @@ -6621,7 +7125,9 @@ inline bool Server::dispatch_request_for_content_reader( } inline bool -Server::process_request(Stream &strm, bool close_connection, +Server::process_request(Stream &strm, const std::string &remote_addr, + int remote_port, const std::string &local_addr, + int local_port, bool close_connection, bool &connection_closed, const std::function &setup_request) { std::array buf{}; @@ -6675,11 +7181,13 @@ Server::process_request(Stream &strm, bool close_connection, connection_closed = true; } - strm.get_remote_ip_and_port(req.remote_addr, req.remote_port); + 
req.remote_addr = remote_addr; + req.remote_port = remote_port; req.set_header("REMOTE_ADDR", req.remote_addr); req.set_header("REMOTE_PORT", std::to_string(req.remote_port)); - strm.get_local_ip_and_port(req.local_addr, req.local_port); + req.local_addr = local_addr; + req.local_port = local_port; req.set_header("LOCAL_ADDR", req.local_addr); req.set_header("LOCAL_PORT", std::to_string(req.local_port)); @@ -6701,13 +7209,20 @@ Server::process_request(Stream &strm, bool close_connection, switch (status) { case StatusCode::Continue_100: case StatusCode::ExpectationFailed_417: - strm.write_format("HTTP/1.1 %d %s\r\n\r\n", status, - status_message(status)); + detail::write_response_line(strm, status); + strm.write("\r\n"); break; - default: return write_response(strm, close_connection, req, res); + default: + connection_closed = true; + return write_response(strm, true, req, res); } } + // Setup `is_connection_closed` method + req.is_connection_closed = [&]() { + return !detail::is_socket_alive(strm.socket()); + }; + // Routing auto routed = false; #ifdef CPPHTTPLIB_NO_EXCEPTIONS @@ -6750,6 +7265,32 @@ Server::process_request(Stream &strm, bool close_connection, : StatusCode::PartialContent_206; } + // Serve file content by using a content provider + if (!res.file_content_path_.empty()) { + const auto &path = res.file_content_path_; + auto mm = std::make_shared(path.c_str()); + if (!mm->is_open()) { + res.body.clear(); + res.content_length_ = 0; + res.content_provider_ = nullptr; + res.status = StatusCode::NotFound_404; + return write_response(strm, close_connection, req, res); + } + + auto content_type = res.file_content_content_type_; + if (content_type.empty()) { + content_type = detail::find_content_type( + path, file_extension_and_mimetype_map_, default_file_mimetype_); + } + + res.set_content_provider( + mm->size(), content_type, + [mm](size_t offset, size_t length, DataSink &sink) -> bool { + sink.write(mm->data() + offset, length); + return true; + }); + } + 
if (detail::range_error(req, res)) { res.body.clear(); res.content_length_ = 0; @@ -6769,12 +7310,21 @@ Server::process_request(Stream &strm, bool close_connection, inline bool Server::is_valid() const { return true; } inline bool Server::process_and_close_socket(socket_t sock) { + std::string remote_addr; + int remote_port = 0; + detail::get_remote_ip_and_port(sock, remote_addr, remote_port); + + std::string local_addr; + int local_port = 0; + detail::get_local_ip_and_port(sock, local_addr, local_port); + auto ret = detail::process_server_socket( svr_sock_, sock, keep_alive_max_count_, keep_alive_timeout_sec_, read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, - [this](Stream &strm, bool close_connection, bool &connection_closed) { - return process_request(strm, close_connection, connection_closed, + [&](Stream &strm, bool close_connection, bool &connection_closed) { + return process_request(strm, remote_addr, remote_port, local_addr, + local_port, close_connection, connection_closed, nullptr); }); @@ -6793,8 +7343,8 @@ inline ClientImpl::ClientImpl(const std::string &host, int port) inline ClientImpl::ClientImpl(const std::string &host, int port, const std::string &client_cert_path, const std::string &client_key_path) - : host_(host), port_(port), - host_and_port_(adjust_host_string(host) + ":" + std::to_string(port)), + : host_(detail::escape_abstract_namespace_unix_domain(host)), port_(port), + host_and_port_(adjust_host_string(host_) + ":" + std::to_string(port)), client_cert_path_(client_cert_path), client_key_path_(client_key_path) {} inline ClientImpl::~ClientImpl() { @@ -6825,6 +7375,7 @@ inline void ClientImpl::copy_settings(const ClientImpl &rhs) { url_encode_ = rhs.url_encode_; address_family_ = rhs.address_family_; tcp_nodelay_ = rhs.tcp_nodelay_; + ipv6_v6only_ = rhs.ipv6_v6only_; socket_options_ = rhs.socket_options_; compress_ = rhs.compress_; decompress_ = rhs.decompress_; @@ -6845,6 +7396,8 @@ inline void 
ClientImpl::copy_settings(const ClientImpl &rhs) { #endif #ifdef CPPHTTPLIB_OPENSSL_SUPPORT server_certificate_verification_ = rhs.server_certificate_verification_; + server_hostname_verification_ = rhs.server_hostname_verification_; + server_certificate_verifier_ = rhs.server_certificate_verifier_; #endif logger_ = rhs.logger_; } @@ -6853,9 +7406,9 @@ inline socket_t ClientImpl::create_client_socket(Error &error) const { if (!proxy_host_.empty() && proxy_port_ != -1) { return detail::create_client_socket( proxy_host_, std::string(), proxy_port_, address_family_, tcp_nodelay_, - socket_options_, connection_timeout_sec_, connection_timeout_usec_, - read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, - write_timeout_usec_, interface_, error); + ipv6_v6only_, socket_options_, connection_timeout_sec_, + connection_timeout_usec_, read_timeout_sec_, read_timeout_usec_, + write_timeout_sec_, write_timeout_usec_, interface_, error); } // Check is custom IP specified for host_ @@ -6864,10 +7417,10 @@ inline socket_t ClientImpl::create_client_socket(Error &error) const { if (it != addr_map_.end()) { ip = it->second; } return detail::create_client_socket( - host_, ip, port_, address_family_, tcp_nodelay_, socket_options_, - connection_timeout_sec_, connection_timeout_usec_, read_timeout_sec_, - read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, interface_, - error); + host_, ip, port_, address_family_, tcp_nodelay_, ipv6_v6only_, + socket_options_, connection_timeout_sec_, connection_timeout_usec_, + read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, + write_timeout_usec_, interface_, error); } inline bool ClientImpl::create_and_connect_socket(Socket &socket, @@ -6956,6 +7509,18 @@ inline bool ClientImpl::send(Request &req, Response &res, Error &error) { return ret; } +#ifdef CPPHTTPLIB_OPENSSL_SUPPORT +inline bool ClientImpl::is_ssl_peer_could_be_closed(SSL *ssl) const { + detail::set_nonblocking(socket_.sock, true); + auto se = detail::scope_exit( + 
[&]() { detail::set_nonblocking(socket_.sock, false); }); + + char buf[1]; + return !SSL_peek(ssl, buf, 1) && + SSL_get_error(ssl, 0) == SSL_ERROR_ZERO_RETURN; +} +#endif + inline bool ClientImpl::send_(Request &req, Response &res, Error &error) { { std::lock_guard guard(socket_mutex_); @@ -6967,6 +7532,13 @@ inline bool ClientImpl::send_(Request &req, Response &res, Error &error) { auto is_alive = false; if (socket_.is_open()) { is_alive = detail::is_socket_alive(socket_.sock); + +#ifdef CPPHTTPLIB_OPENSSL_SUPPORT + if (is_alive && is_ssl()) { + if (is_ssl_peer_could_be_closed(socket_.ssl)) { is_alive = false; } + } +#endif + if (!is_alive) { // Attempt to avoid sigpipe by shutting down nongracefully if it seems // like the other side has already closed the connection Also, there @@ -7144,7 +7716,7 @@ inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) { if (location.empty()) { return false; } const static std::regex re( - R"((?:(https?):)?(?://(?:\[([\d:]+)\]|([^:/?#]+))(?::(\d+))?)?([^?#]*)(\?[^#]*)?(?:#.*)?)"); + R"((?:(https?):)?(?://(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)?([^?#]*)(\?[^#]*)?(?:#.*)?)"); std::smatch m; if (!std::regex_match(location, m, re)) { return false; } @@ -7243,12 +7815,26 @@ inline bool ClientImpl::write_request(Stream &strm, Request &req, if (!req.has_header("Accept")) { req.set_header("Accept", "*/*"); } -#ifndef CPPHTTPLIB_NO_DEFAULT_USER_AGENT - if (!req.has_header("User-Agent")) { - auto agent = std::string("cpp-httplib/") + CPPHTTPLIB_VERSION; - req.set_header("User-Agent", agent); - } + if (!req.content_receiver) { + if (!req.has_header("Accept-Encoding")) { + std::string accept_encoding; +#ifdef CPPHTTPLIB_BROTLI_SUPPORT + accept_encoding = "br"; #endif +#ifdef CPPHTTPLIB_ZLIB_SUPPORT + if (!accept_encoding.empty()) { accept_encoding += ", "; } + accept_encoding += "gzip, deflate"; +#endif + req.set_header("Accept-Encoding", accept_encoding); + } + +#ifndef CPPHTTPLIB_NO_DEFAULT_USER_AGENT + if 
(!req.has_header("User-Agent")) { + auto agent = std::string("cpp-httplib/") + CPPHTTPLIB_VERSION; + req.set_header("User-Agent", agent); + } +#endif + }; if (req.body.empty()) { if (req.content_provider_) { @@ -7308,8 +7894,14 @@ inline bool ClientImpl::write_request(Stream &strm, Request &req, { detail::BufferStream bstrm; - const auto &path = url_encode_ ? detail::encode_url(req.path) : req.path; - bstrm.write_format("%s %s HTTP/1.1\r\n", req.method.c_str(), path.c_str()); + const auto &path_with_query = + req.params.empty() ? req.path + : append_query_params(req.path, req.params); + + const auto &path = + url_encode_ ? detail::encode_url(path_with_query) : path_with_query; + + detail::write_request_line(bstrm, req.method, path); header_writer_(bstrm, req.headers); @@ -7417,11 +8009,12 @@ inline Result ClientImpl::send_with_content_provider( const std::string &method, const std::string &path, const Headers &headers, const char *body, size_t content_length, ContentProvider content_provider, ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type) { + const std::string &content_type, Progress progress) { Request req; req.method = method; req.headers = headers; req.path = path; + req.progress = progress; auto error = Error::Success; @@ -7448,9 +8041,7 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req, if (is_ssl()) { auto is_proxy_enabled = !proxy_host_.empty() && proxy_port_ != -1; if (!is_proxy_enabled) { - char buf[1]; - if (SSL_peek(socket_.ssl, buf, 1) == 0 && - SSL_get_error(socket_.ssl, 0) == SSL_ERROR_ZERO_RETURN) { + if (is_ssl_peer_could_be_closed(socket_.ssl)) { error = Error::SSLPeerCouldBeClosed_; return false; } @@ -7468,7 +8059,9 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req, // Body if ((res.status != StatusCode::NoContent_204) && req.method != "HEAD" && req.method != "CONNECT") { - auto redirect = 300 < res.status && res.status < 400 && follow_location_; + auto 
redirect = 300 < res.status && res.status < 400 && + res.status != StatusCode::NotModified_304 && + follow_location_; if (req.response_handler && !redirect) { if (!req.response_handler(res)) { @@ -7489,9 +8082,7 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req, : static_cast( [&](const char *buf, size_t n, uint64_t /*off*/, uint64_t /*len*/) { - if (res.body.size() + n > res.body.max_size()) { - return false; - } + assert(res.body.size() + n <= res.body.max_size()); res.body.append(buf, n); return true; }); @@ -7503,12 +8094,25 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req, return ret; }; - int dummy_status; - if (!detail::read_content(strm, res, (std::numeric_limits::max)(), - dummy_status, std::move(progress), std::move(out), - decompress_)) { - if (error != Error::Canceled) { error = Error::Read; } - return false; + if (res.has_header("Content-Length")) { + if (!req.content_receiver) { + auto len = res.get_header_value_u64("Content-Length"); + if (len > res.body.max_size()) { + error = Error::Read; + return false; + } + res.body.reserve(static_cast(len)); + } + } + + if (res.status != StatusCode::NotModified_304) { + int dummy_status; + if (!detail::read_content(strm, res, (std::numeric_limits::max)(), + dummy_status, std::move(progress), + std::move(out), decompress_)) { + if (error != Error::Canceled) { error = Error::Read; } + return false; + } } } @@ -7717,14 +8321,22 @@ inline Result ClientImpl::Post(const std::string &path, inline Result ClientImpl::Post(const std::string &path, const char *body, size_t content_length, const std::string &content_type) { - return Post(path, Headers(), body, content_length, content_type); + return Post(path, Headers(), body, content_length, content_type, nullptr); } inline Result ClientImpl::Post(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { return send_with_content_provider("POST", path, headers, body, 
content_length, - nullptr, nullptr, content_type); + nullptr, nullptr, content_type, nullptr); +} + +inline Result ClientImpl::Post(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, + Progress progress) { + return send_with_content_provider("POST", path, headers, body, content_length, + nullptr, nullptr, content_type, progress); } inline Result ClientImpl::Post(const std::string &path, const std::string &body, @@ -7732,12 +8344,27 @@ inline Result ClientImpl::Post(const std::string &path, const std::string &body, return Post(path, Headers(), body, content_type); } +inline Result ClientImpl::Post(const std::string &path, const std::string &body, + const std::string &content_type, + Progress progress) { + return Post(path, Headers(), body, content_type, progress); +} + inline Result ClientImpl::Post(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return send_with_content_provider("POST", path, headers, body.data(), - body.size(), nullptr, nullptr, - content_type); + body.size(), nullptr, nullptr, content_type, + nullptr); +} + +inline Result ClientImpl::Post(const std::string &path, const Headers &headers, + const std::string &body, + const std::string &content_type, + Progress progress) { + return send_with_content_provider("POST", path, headers, body.data(), + body.size(), nullptr, nullptr, content_type, + progress); } inline Result ClientImpl::Post(const std::string &path, const Params ¶ms) { @@ -7763,14 +8390,15 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers, const std::string &content_type) { return send_with_content_provider("POST", path, headers, nullptr, content_length, std::move(content_provider), - nullptr, content_type); + nullptr, content_type, nullptr); } inline Result ClientImpl::Post(const std::string &path, const Headers &headers, ContentProviderWithoutLength content_provider, 
const std::string &content_type) { return send_with_content_provider("POST", path, headers, nullptr, 0, nullptr, - std::move(content_provider), content_type); + std::move(content_provider), content_type, + nullptr); } inline Result ClientImpl::Post(const std::string &path, const Headers &headers, @@ -7779,6 +8407,13 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers, return Post(path, headers, query, "application/x-www-form-urlencoded"); } +inline Result ClientImpl::Post(const std::string &path, const Headers &headers, + const Params ¶ms, Progress progress) { + auto query = detail::params_to_query_str(params); + return Post(path, headers, query, "application/x-www-form-urlencoded", + progress); +} + inline Result ClientImpl::Post(const std::string &path, const MultipartFormDataItems &items) { return Post(path, Headers(), items); @@ -7816,7 +8451,7 @@ ClientImpl::Post(const std::string &path, const Headers &headers, return send_with_content_provider( "POST", path, headers, nullptr, 0, nullptr, get_multipart_content_provider(boundary, items, provider_items), - content_type); + content_type, nullptr); } inline Result ClientImpl::Put(const std::string &path) { @@ -7833,7 +8468,15 @@ inline Result ClientImpl::Put(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { return send_with_content_provider("PUT", path, headers, body, content_length, - nullptr, nullptr, content_type); + nullptr, nullptr, content_type, nullptr); +} + +inline Result ClientImpl::Put(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, + Progress progress) { + return send_with_content_provider("PUT", path, headers, body, content_length, + nullptr, nullptr, content_type, progress); } inline Result ClientImpl::Put(const std::string &path, const std::string &body, @@ -7841,12 +8484,27 @@ inline Result ClientImpl::Put(const 
std::string &path, const std::string &body, return Put(path, Headers(), body, content_type); } +inline Result ClientImpl::Put(const std::string &path, const std::string &body, + const std::string &content_type, + Progress progress) { + return Put(path, Headers(), body, content_type, progress); +} + inline Result ClientImpl::Put(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return send_with_content_provider("PUT", path, headers, body.data(), - body.size(), nullptr, nullptr, - content_type); + body.size(), nullptr, nullptr, content_type, + nullptr); +} + +inline Result ClientImpl::Put(const std::string &path, const Headers &headers, + const std::string &body, + const std::string &content_type, + Progress progress) { + return send_with_content_provider("PUT", path, headers, body.data(), + body.size(), nullptr, nullptr, content_type, + progress); } inline Result ClientImpl::Put(const std::string &path, size_t content_length, @@ -7868,14 +8526,15 @@ inline Result ClientImpl::Put(const std::string &path, const Headers &headers, const std::string &content_type) { return send_with_content_provider("PUT", path, headers, nullptr, content_length, std::move(content_provider), - nullptr, content_type); + nullptr, content_type, nullptr); } inline Result ClientImpl::Put(const std::string &path, const Headers &headers, ContentProviderWithoutLength content_provider, const std::string &content_type) { return send_with_content_provider("PUT", path, headers, nullptr, 0, nullptr, - std::move(content_provider), content_type); + std::move(content_provider), content_type, + nullptr); } inline Result ClientImpl::Put(const std::string &path, const Params ¶ms) { @@ -7888,6 +8547,13 @@ inline Result ClientImpl::Put(const std::string &path, const Headers &headers, return Put(path, headers, query, "application/x-www-form-urlencoded"); } +inline Result ClientImpl::Put(const std::string &path, const Headers &headers, + const Params 
¶ms, Progress progress) { + auto query = detail::params_to_query_str(params); + return Put(path, headers, query, "application/x-www-form-urlencoded", + progress); +} + inline Result ClientImpl::Put(const std::string &path, const MultipartFormDataItems &items) { return Put(path, Headers(), items); @@ -7925,7 +8591,7 @@ ClientImpl::Put(const std::string &path, const Headers &headers, return send_with_content_provider( "PUT", path, headers, nullptr, 0, nullptr, get_multipart_content_provider(boundary, items, provider_items), - content_type); + content_type, nullptr); } inline Result ClientImpl::Patch(const std::string &path) { return Patch(path, std::string(), std::string()); @@ -7937,12 +8603,26 @@ inline Result ClientImpl::Patch(const std::string &path, const char *body, return Patch(path, Headers(), body, content_length, content_type); } +inline Result ClientImpl::Patch(const std::string &path, const char *body, + size_t content_length, + const std::string &content_type, + Progress progress) { + return Patch(path, Headers(), body, content_length, content_type, progress); +} + inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { + return Patch(path, headers, body, content_length, content_type, nullptr); +} + +inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, + Progress progress) { return send_with_content_provider("PATCH", path, headers, body, content_length, nullptr, nullptr, - content_type); + content_type, progress); } inline Result ClientImpl::Patch(const std::string &path, @@ -7951,12 +8631,26 @@ inline Result ClientImpl::Patch(const std::string &path, return Patch(path, Headers(), body, content_type); } +inline Result ClientImpl::Patch(const std::string &path, + const std::string &body, + const std::string &content_type, + Progress progress) { + 
return Patch(path, Headers(), body, content_type, progress); +} + inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { + return Patch(path, headers, body, content_type, nullptr); +} + +inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, + const std::string &body, + const std::string &content_type, + Progress progress) { return send_with_content_provider("PATCH", path, headers, body.data(), - body.size(), nullptr, nullptr, - content_type); + body.size(), nullptr, nullptr, content_type, + progress); } inline Result ClientImpl::Patch(const std::string &path, size_t content_length, @@ -7978,14 +8672,15 @@ inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, const std::string &content_type) { return send_with_content_provider("PATCH", path, headers, nullptr, content_length, std::move(content_provider), - nullptr, content_type); + nullptr, content_type, nullptr); } inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, ContentProviderWithoutLength content_provider, const std::string &content_type) { return send_with_content_provider("PATCH", path, headers, nullptr, 0, nullptr, - std::move(content_provider), content_type); + std::move(content_provider), content_type, + nullptr); } inline Result ClientImpl::Delete(const std::string &path) { @@ -8003,14 +8698,30 @@ inline Result ClientImpl::Delete(const std::string &path, const char *body, return Delete(path, Headers(), body, content_length, content_type); } +inline Result ClientImpl::Delete(const std::string &path, const char *body, + size_t content_length, + const std::string &content_type, + Progress progress) { + return Delete(path, Headers(), body, content_length, content_type, progress); +} + inline Result ClientImpl::Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { + 
return Delete(path, headers, body, content_length, content_type, nullptr); +} + +inline Result ClientImpl::Delete(const std::string &path, + const Headers &headers, const char *body, + size_t content_length, + const std::string &content_type, + Progress progress) { Request req; req.method = "DELETE"; req.headers = headers; req.path = path; + req.progress = progress; if (!content_type.empty()) { req.set_header("Content-Type", content_type); } req.body.assign(body, content_length); @@ -8024,6 +8735,14 @@ inline Result ClientImpl::Delete(const std::string &path, return Delete(path, Headers(), body.data(), body.size(), content_type); } +inline Result ClientImpl::Delete(const std::string &path, + const std::string &body, + const std::string &content_type, + Progress progress) { + return Delete(path, Headers(), body.data(), body.size(), content_type, + progress); +} + inline Result ClientImpl::Delete(const std::string &path, const Headers &headers, const std::string &body, @@ -8031,6 +8750,15 @@ inline Result ClientImpl::Delete(const std::string &path, return Delete(path, headers, body.data(), body.size(), content_type); } +inline Result ClientImpl::Delete(const std::string &path, + const Headers &headers, + const std::string &body, + const std::string &content_type, + Progress progress) { + return Delete(path, headers, body.data(), body.size(), content_type, + progress); +} + inline Result ClientImpl::Options(const std::string &path) { return Options(path, Headers()); } @@ -8138,6 +8866,8 @@ inline void ClientImpl::set_address_family(int family) { inline void ClientImpl::set_tcp_nodelay(bool on) { tcp_nodelay_ = on; } +inline void ClientImpl::set_ipv6_v6only(bool on) { ipv6_v6only_ = on; } + inline void ClientImpl::set_socket_options(SocketOptions socket_options) { socket_options_ = std::move(socket_options); } @@ -8187,13 +8917,11 @@ inline void ClientImpl::set_ca_cert_store(X509_STORE *ca_cert_store) { inline X509_STORE *ClientImpl::create_ca_cert_store(const char 
*ca_cert, std::size_t size) const { auto mem = BIO_new_mem_buf(ca_cert, static_cast(size)); + auto se = detail::scope_exit([&] { BIO_free_all(mem); }); if (!mem) { return nullptr; } auto inf = PEM_X509_INFO_read_bio(mem, nullptr, nullptr, nullptr); - if (!inf) { - BIO_free_all(mem); - return nullptr; - } + if (!inf) { return nullptr; } auto cts = X509_STORE_new(); if (cts) { @@ -8207,13 +8935,21 @@ inline X509_STORE *ClientImpl::create_ca_cert_store(const char *ca_cert, } sk_X509_INFO_pop_free(inf, X509_INFO_free); - BIO_free_all(mem); return cts; } inline void ClientImpl::enable_server_certificate_verification(bool enabled) { server_certificate_verification_ = enabled; } + +inline void ClientImpl::enable_server_hostname_verification(bool enabled) { + server_hostname_verification_ = enabled; +} + +inline void ClientImpl::set_server_certificate_verifier( + std::function verifier) { + server_certificate_verifier_ = verifier; +} #endif inline void ClientImpl::set_logger(Logger logger) { @@ -8257,13 +8993,30 @@ inline SSL *ssl_new(socket_t sock, SSL_CTX *ctx, std::mutex &ctx_mutex, return ssl; } -inline void ssl_delete(std::mutex &ctx_mutex, SSL *ssl, +inline void ssl_delete(std::mutex &ctx_mutex, SSL *ssl, socket_t sock, bool shutdown_gracefully) { // sometimes we may want to skip this to try to avoid SIGPIPE if we know // the remote has closed the network connection // Note that it is not always possible to avoid SIGPIPE, this is merely a // best-efforts. 
- if (shutdown_gracefully) { SSL_shutdown(ssl); } + if (shutdown_gracefully) { +#ifdef _WIN32 + (void)(sock); + SSL_shutdown(ssl); +#else + timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, + reinterpret_cast(&tv), sizeof(tv)); + + auto ret = SSL_shutdown(ssl); + while (ret == 0) { + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + ret = SSL_shutdown(ssl); + } +#endif + } std::lock_guard guard(ctx_mutex); SSL_free(ssl); @@ -8366,7 +9119,7 @@ inline ssize_t SSLSocketStream::read(char *ptr, size_t size) { if (SSL_pending(ssl_) > 0) { return SSL_read(ssl_, ptr, static_cast(size)); } else if (is_readable()) { - std::this_thread::sleep_for(std::chrono::milliseconds(1)); + std::this_thread::sleep_for(std::chrono::microseconds{10}); ret = SSL_read(ssl_, ptr, static_cast(size)); if (ret >= 0) { return ret; } err = SSL_get_error(ssl_, ret); @@ -8397,7 +9150,7 @@ inline ssize_t SSLSocketStream::write(const char *ptr, size_t size) { while (--n >= 0 && err == SSL_ERROR_WANT_WRITE) { #endif if (is_writable()) { - std::this_thread::sleep_for(std::chrono::milliseconds(1)); + std::this_thread::sleep_for(std::chrono::microseconds{10}); ret = SSL_write(ssl_, ptr, static_cast(handle_size)); if (ret >= 0) { return ret; } err = SSL_get_error(ssl_, ret); @@ -8439,7 +9192,7 @@ inline SSLServer::SSLServer(const char *cert_path, const char *private_key_path, SSL_OP_NO_COMPRESSION | SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION); - SSL_CTX_set_min_proto_version(ctx_, TLS1_1_VERSION); + SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION); if (private_key_password != nullptr && (private_key_password[0] != '\0')) { SSL_CTX_set_default_passwd_cb_userdata( @@ -8449,7 +9202,8 @@ inline SSLServer::SSLServer(const char *cert_path, const char *private_key_path, if (SSL_CTX_use_certificate_chain_file(ctx_, cert_path) != 1 || SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) != - 1) { + 1 || + SSL_CTX_check_private_key(ctx_) != 
1) { SSL_CTX_free(ctx_); ctx_ = nullptr; } else if (client_ca_cert_file_path || client_ca_cert_dir_path) { @@ -8471,7 +9225,7 @@ inline SSLServer::SSLServer(X509 *cert, EVP_PKEY *private_key, SSL_OP_NO_COMPRESSION | SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION); - SSL_CTX_set_min_proto_version(ctx_, TLS1_1_VERSION); + SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION); if (SSL_CTX_use_certificate(ctx_, cert) != 1 || SSL_CTX_use_PrivateKey(ctx_, private_key) != 1) { @@ -8505,6 +9259,19 @@ inline bool SSLServer::is_valid() const { return ctx_; } inline SSL_CTX *SSLServer::ssl_context() const { return ctx_; } +inline void SSLServer::update_certs(X509 *cert, EVP_PKEY *private_key, + X509_STORE *client_ca_cert_store) { + + std::lock_guard guard(ctx_mutex_); + + SSL_CTX_use_certificate(ctx_, cert); + SSL_CTX_use_PrivateKey(ctx_, private_key); + + if (client_ca_cert_store != nullptr) { + SSL_CTX_set_cert_store(ctx_, client_ca_cert_store); + } +} + inline bool SSLServer::process_and_close_socket(socket_t sock) { auto ssl = detail::ssl_new( sock, ctx_, ctx_mutex_, @@ -8516,20 +9283,29 @@ inline bool SSLServer::process_and_close_socket(socket_t sock) { auto ret = false; if (ssl) { + std::string remote_addr; + int remote_port = 0; + detail::get_remote_ip_and_port(sock, remote_addr, remote_port); + + std::string local_addr; + int local_port = 0; + detail::get_local_ip_and_port(sock, local_addr, local_port); + ret = detail::process_server_socket_ssl( svr_sock_, ssl, sock, keep_alive_max_count_, keep_alive_timeout_sec_, read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, - [this, ssl](Stream &strm, bool close_connection, - bool &connection_closed) { - return process_request(strm, close_connection, connection_closed, + [&](Stream &strm, bool close_connection, bool &connection_closed) { + return process_request(strm, remote_addr, remote_port, local_addr, + local_port, close_connection, + connection_closed, [&](Request &req) { req.ssl = ssl; }); }); // 
Shutdown gracefully if the result seemed successful, non-gracefully if // the connection appeared to be closed. const bool shutdown_gracefully = ret; - detail::ssl_delete(ctx_mutex_, ssl, shutdown_gracefully); + detail::ssl_delete(ctx_mutex_, ssl, sock, shutdown_gracefully); } detail::shutdown_socket(sock); @@ -8551,6 +9327,8 @@ inline SSLClient::SSLClient(const std::string &host, int port, : ClientImpl(host, port, client_cert_path, client_key_path) { ctx_ = SSL_CTX_new(TLS_client_method()); + SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION); + detail::split(&host_[0], &host_[host_.size()], '.', [&](const char *b, const char *e) { host_components_.emplace_back(b, e); @@ -8758,36 +9536,47 @@ inline bool SSLClient::initialize_ssl(Socket &socket, Error &error) { } if (server_certificate_verification_) { - verify_result_ = SSL_get_verify_result(ssl2); + if (server_certificate_verifier_) { + if (!server_certificate_verifier_(ssl2)) { + error = Error::SSLServerVerification; + return false; + } + } else { + verify_result_ = SSL_get_verify_result(ssl2); - if (verify_result_ != X509_V_OK) { - error = Error::SSLServerVerification; - return false; + if (verify_result_ != X509_V_OK) { + error = Error::SSLServerVerification; + return false; + } + + auto server_cert = SSL_get1_peer_certificate(ssl2); + auto se = detail::scope_exit([&] { X509_free(server_cert); }); + + if (server_cert == nullptr) { + error = Error::SSLServerVerification; + return false; + } + + if (server_hostname_verification_) { + if (!verify_host(server_cert)) { + error = Error::SSLServerHostnameVerification; + return false; + } + } } - - auto server_cert = SSL_get1_peer_certificate(ssl2); - - if (server_cert == nullptr) { - error = Error::SSLServerVerification; - return false; - } - - if (!verify_host(server_cert)) { - X509_free(server_cert); - error = Error::SSLServerVerification; - return false; - } - X509_free(server_cert); } return true; }, [&](SSL *ssl2) { +#if defined(OPENSSL_IS_BORINGSSL) + 
SSL_set_tlsext_host_name(ssl2, host_.c_str()); +#else // NOTE: Direct call instead of using the OpenSSL macro to suppress // -Wold-style-cast warning - // SSL_set_tlsext_host_name(ssl2, host_.c_str()); SSL_ctrl(ssl2, SSL_CTRL_SET_TLSEXT_HOSTNAME, TLSEXT_NAMETYPE_host_name, static_cast(const_cast(host_.c_str()))); +#endif return true; }); @@ -8812,7 +9601,8 @@ inline void SSLClient::shutdown_ssl_impl(Socket &socket, return; } if (socket.ssl) { - detail::ssl_delete(ctx_mutex_, socket.ssl, shutdown_gracefully); + detail::ssl_delete(ctx_mutex_, socket.ssl, socket.sock, + shutdown_gracefully); socket.ssl = nullptr; } assert(socket.ssl == nullptr); @@ -8861,8 +9651,8 @@ SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const { auto type = GEN_DNS; - struct in6_addr addr6 {}; - struct in_addr addr {}; + struct in6_addr addr6{}; + struct in_addr addr{}; size_t addr_len = 0; #ifndef __MINGW32__ @@ -8965,7 +9755,7 @@ inline Client::Client(const std::string &scheme_host_port, const std::string &client_cert_path, const std::string &client_key_path) { const static std::regex re( - R"((?:([a-z]+):\/\/)?(?:\[([\d:]+)\]|([^:/?#]+))(?::(\d+))?)"); + R"((?:([a-z]+):\/\/)?(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)"); std::smatch m; if (std::regex_match(scheme_host_port, m, re)) { @@ -9002,10 +9792,12 @@ inline Client::Client(const std::string &scheme_host_port, client_key_path); } } else { + // NOTE: Update TEST(UniversalClientImplTest, Ipv6LiteralAddress) + // if port param below changes. 
cli_ = detail::make_unique(scheme_host_port, 80, client_cert_path, client_key_path); } -} +} // namespace detail inline Client::Client(const std::string &host, int port) : cli_(detail::make_unique(host, port)) {} @@ -9111,15 +9903,30 @@ inline Result Client::Post(const std::string &path, const Headers &headers, const std::string &content_type) { return cli_->Post(path, headers, body, content_length, content_type); } +inline Result Client::Post(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, Progress progress) { + return cli_->Post(path, headers, body, content_length, content_type, + progress); +} inline Result Client::Post(const std::string &path, const std::string &body, const std::string &content_type) { return cli_->Post(path, body, content_type); } +inline Result Client::Post(const std::string &path, const std::string &body, + const std::string &content_type, Progress progress) { + return cli_->Post(path, body, content_type, progress); +} inline Result Client::Post(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return cli_->Post(path, headers, body, content_type); } +inline Result Client::Post(const std::string &path, const Headers &headers, + const std::string &body, + const std::string &content_type, Progress progress) { + return cli_->Post(path, headers, body, content_type, progress); +} inline Result Client::Post(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type) { @@ -9150,6 +9957,10 @@ inline Result Client::Post(const std::string &path, const Headers &headers, const Params ¶ms) { return cli_->Post(path, headers, params); } +inline Result Client::Post(const std::string &path, const Headers &headers, + const Params ¶ms, Progress progress) { + return cli_->Post(path, headers, params, progress); +} inline Result Client::Post(const std::string &path, const 
MultipartFormDataItems &items) { return cli_->Post(path, items); @@ -9180,15 +9991,29 @@ inline Result Client::Put(const std::string &path, const Headers &headers, const std::string &content_type) { return cli_->Put(path, headers, body, content_length, content_type); } +inline Result Client::Put(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, Progress progress) { + return cli_->Put(path, headers, body, content_length, content_type, progress); +} inline Result Client::Put(const std::string &path, const std::string &body, const std::string &content_type) { return cli_->Put(path, body, content_type); } +inline Result Client::Put(const std::string &path, const std::string &body, + const std::string &content_type, Progress progress) { + return cli_->Put(path, body, content_type, progress); +} inline Result Client::Put(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return cli_->Put(path, headers, body, content_type); } +inline Result Client::Put(const std::string &path, const Headers &headers, + const std::string &body, + const std::string &content_type, Progress progress) { + return cli_->Put(path, headers, body, content_type, progress); +} inline Result Client::Put(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type) { @@ -9219,6 +10044,10 @@ inline Result Client::Put(const std::string &path, const Headers &headers, const Params ¶ms) { return cli_->Put(path, headers, params); } +inline Result Client::Put(const std::string &path, const Headers &headers, + const Params ¶ms, Progress progress) { + return cli_->Put(path, headers, params, progress); +} inline Result Client::Put(const std::string &path, const MultipartFormDataItems &items) { return cli_->Put(path, items); @@ -9246,20 +10075,44 @@ inline Result Client::Patch(const std::string &path, const char *body, const 
std::string &content_type) { return cli_->Patch(path, body, content_length, content_type); } +inline Result Client::Patch(const std::string &path, const char *body, + size_t content_length, + const std::string &content_type, + Progress progress) { + return cli_->Patch(path, body, content_length, content_type, progress); +} inline Result Client::Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { return cli_->Patch(path, headers, body, content_length, content_type); } +inline Result Client::Patch(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, + Progress progress) { + return cli_->Patch(path, headers, body, content_length, content_type, + progress); +} inline Result Client::Patch(const std::string &path, const std::string &body, const std::string &content_type) { return cli_->Patch(path, body, content_type); } +inline Result Client::Patch(const std::string &path, const std::string &body, + const std::string &content_type, + Progress progress) { + return cli_->Patch(path, body, content_type, progress); +} inline Result Client::Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return cli_->Patch(path, headers, body, content_type); } +inline Result Client::Patch(const std::string &path, const Headers &headers, + const std::string &body, + const std::string &content_type, + Progress progress) { + return cli_->Patch(path, headers, body, content_type, progress); +} inline Result Client::Patch(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type) { @@ -9294,20 +10147,44 @@ inline Result Client::Delete(const std::string &path, const char *body, const std::string &content_type) { return cli_->Delete(path, body, content_length, content_type); } +inline Result Client::Delete(const std::string 
&path, const char *body, + size_t content_length, + const std::string &content_type, + Progress progress) { + return cli_->Delete(path, body, content_length, content_type, progress); +} inline Result Client::Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type) { return cli_->Delete(path, headers, body, content_length, content_type); } +inline Result Client::Delete(const std::string &path, const Headers &headers, + const char *body, size_t content_length, + const std::string &content_type, + Progress progress) { + return cli_->Delete(path, headers, body, content_length, content_type, + progress); +} inline Result Client::Delete(const std::string &path, const std::string &body, const std::string &content_type) { return cli_->Delete(path, body, content_type); } +inline Result Client::Delete(const std::string &path, const std::string &body, + const std::string &content_type, + Progress progress) { + return cli_->Delete(path, body, content_type, progress); +} inline Result Client::Delete(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type) { return cli_->Delete(path, headers, body, content_type); } +inline Result Client::Delete(const std::string &path, const Headers &headers, + const std::string &body, + const std::string &content_type, + Progress progress) { + return cli_->Delete(path, headers, body, content_type, progress); +} inline Result Client::Options(const std::string &path) { return cli_->Options(path); } @@ -9417,6 +10294,15 @@ inline void Client::set_proxy_digest_auth(const std::string &username, inline void Client::enable_server_certificate_verification(bool enabled) { cli_->enable_server_certificate_verification(enabled); } + +inline void Client::enable_server_hostname_verification(bool enabled) { + cli_->enable_server_hostname_verification(enabled); +} + +inline void Client::set_server_certificate_verifier( + std::function 
verifier) { + cli_->set_server_certificate_verifier(verifier); +} #endif inline void Client::set_logger(Logger logger) { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 64c0c4ef6..d1e8ee829 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -19,6 +19,7 @@ #include "loading.html.hpp" #include +#include #include #include #include @@ -32,6 +33,8 @@ using json = nlohmann::ordered_json; +constexpr int HTTP_POLLING_SECONDS = 1; + enum stop_type { STOP_TYPE_NONE, STOP_TYPE_EOS, @@ -1602,6 +1605,30 @@ struct server_response { // should never reach here } + // same as recv(), but have timeout in seconds + // if timeout is reached, nullptr is returned + server_task_result_ptr recv_with_timeout(const std::unordered_set & id_tasks, int timeout) { + while (true) { + std::unique_lock lock(mutex_results); + bool cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout), [&]{ + return !queue_results.empty(); + }); + if (!cr_res) { + return nullptr; + } + + for (int i = 0; i < (int) queue_results.size(); i++) { + if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { + server_task_result_ptr res = std::move(queue_results[i]); + queue_results.erase(queue_results.begin() + i); + return res; + } + } + } + + // should never reach here + } + // single-task version of recv() server_task_result_ptr recv(int id_task) { std::unordered_set id_tasks = {id_task}; @@ -2322,10 +2349,21 @@ struct server_context { void receive_multi_results( const std::unordered_set & id_tasks, const std::function&)> & result_handler, - const std::function & error_handler) { + const std::function & error_handler, + const std::function & is_connection_closed) { std::vector results(id_tasks.size()); - for (size_t i = 0; i < id_tasks.size(); i++) { - server_task_result_ptr result = queue_results.recv(id_tasks); + for (int i = 0; i < (int)id_tasks.size(); i++) { + server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, 
HTTP_POLLING_SECONDS); + + if (is_connection_closed()) { + cancel_tasks(id_tasks); + return; + } + + if (result == nullptr) { + i--; // retry + continue; + } if (result->is_error()) { error_handler(result->to_json()); @@ -2349,10 +2387,20 @@ struct server_context { void receive_cmpl_results_stream( const std::unordered_set & id_tasks, const std::function & result_handler, - const std::function & error_handler) { + const std::function & error_handler, + const std::function & is_connection_closed) { size_t n_finished = 0; while (true) { - server_task_result_ptr result = queue_results.recv(id_tasks); + server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS); + + if (is_connection_closed()) { + cancel_tasks(id_tasks); + return; + } + + if (result == nullptr) { + continue; // retry + } if (result->is_error()) { error_handler(result->to_json()); @@ -3633,6 +3681,7 @@ int main(int argc, char ** argv) { const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok]( server_task_type type, json & data, + std::function is_connection_closed, httplib::Response & res, oaicompat_type oaicompat) { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); @@ -3694,7 +3743,7 @@ int main(int argc, char ** argv) { } }, [&](const json & error_data) { res_error(res, error_data); - }); + }, is_connection_closed); ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { @@ -3704,6 +3753,7 @@ int main(int argc, char ** argv) { if (res_json.is_array()) { for (const auto & res : res_json) { if (!server_sent_event(sink, "data", res)) { + // sending failed (HTTP connection closed), cancel the generation return false; } } @@ -3713,6 +3763,9 @@ int main(int argc, char ** argv) { } }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); + }, [&sink]() { + // note: do not use req.is_connection_closed here because req is already destroyed + return !sink.is_writable(); }); if (oaicompat 
!= OAICOMPAT_TYPE_NONE) { static const std::string ev_done = "data: [DONE]\n\n"; @@ -3735,6 +3788,7 @@ int main(int argc, char ** argv) { return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, + req.is_connection_closed, res, OAICOMPAT_TYPE_NONE); }; @@ -3744,6 +3798,7 @@ int main(int argc, char ** argv) { return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, + req.is_connection_closed, res, OAICOMPAT_TYPE_COMPLETION); }; @@ -3820,6 +3875,7 @@ int main(int argc, char ** argv) { return handle_completions_impl( SERVER_TASK_TYPE_INFILL, data, + req.is_connection_closed, res, OAICOMPAT_TYPE_NONE); // infill is not OAI compatible }; @@ -3834,6 +3890,7 @@ int main(int argc, char ** argv) { return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, + req.is_connection_closed, res, OAICOMPAT_TYPE_CHAT); }; @@ -3980,7 +4037,7 @@ int main(int argc, char ** argv) { }, [&](const json & error_data) { res_error(res, error_data); error = true; - }); + }, req.is_connection_closed); ctx_server.queue_results.remove_waiting_task_ids(task_ids); } @@ -4070,7 +4127,7 @@ int main(int argc, char ** argv) { }, [&](const json & error_data) { res_error(res, error_data); error = true; - }); + }, req.is_connection_closed); } if (error) { diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index e5e3b6077..c1fc12462 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -1,4 +1,5 @@ import pytest +import requests import time from openai import OpenAI from utils import * @@ -405,3 +406,23 @@ def test_n_probs_post_sampling(): assert "bytes" in prob and type(prob["bytes"]) == list # because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs assert any(prob["prob"] == 1.0 for prob in tok["top_probs"]) + + +def test_cancel_request(): + global server + server.n_ctx = 4096 + server.n_predict = -1 + 
server.n_slots = 1 + server.server_slots = True + server.start() + # send a request that will take a long time, but cancel it before it finishes + try: + server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + }, timeout=0.1) + except requests.exceptions.ReadTimeout: + pass # expected + # make sure the slot is free + time.sleep(1) # wait for HTTP_POLLING_SECONDS + res = server.make_request("GET", "/slots") + assert res.body[0]["is_processing"] == False diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index a1a94d0f1..73be4c92f 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -219,17 +219,18 @@ class ServerProcess: path: str, data: dict | Any | None = None, headers: dict | None = None, + timeout: float | None = None, ) -> ServerResponse: url = f"http://{self.server_host}:{self.server_port}{path}" parse_body = False if method == "GET": - response = requests.get(url, headers=headers) + response = requests.get(url, headers=headers, timeout=timeout) parse_body = True elif method == "POST": - response = requests.post(url, headers=headers, json=data) + response = requests.post(url, headers=headers, json=data, timeout=timeout) parse_body = True elif method == "OPTIONS": - response = requests.options(url, headers=headers) + response = requests.options(url, headers=headers, timeout=timeout) else: raise ValueError(f"Unimplemented method: {method}") result = ServerResponse() From 4dd34ff83165a483ebff7bd43621b28490fa1fd6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 18 Jan 2025 16:18:15 +0200 Subject: [PATCH 24/30] cmake : add sanitizer flags for llama.cpp (#11279) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * cmake : add sanitizer flags for llama.cpp ggml-ci * tests : fix compile warnings ggml-ci * cmake : move sanitizer flags to llama_add_compile_flags ggml-ci * cmake : move llama.cpp compile flags to top level 
lists ggml-ci * cmake : apply only sanitizer flags at top level ggml-ci * tests : fix gguf context use in same_tensor_data * gguf-test: tensor data comparison * dummy : trigger ggml-ci * unicode : silence gcc warnings ggml-ci * ci : use sanitizer builds only in Debug mode ggml-ci * cmake : add status messages [no ci] --------- Co-authored-by: Johannes Gäßler --- .github/workflows/build.yml | 2 +- CMakeLists.txt | 73 +++++++++++++++++++++++++------------ ggml/src/gguf.cpp | 4 ++ src/unicode.cpp | 5 +-- tests/CMakeLists.txt | 2 + tests/test-gguf.cpp | 28 ++++++++------ tests/test-sampling.cpp | 1 - 7 files changed, 74 insertions(+), 41 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c85999b89..9e0c4a675 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -234,7 +234,7 @@ jobs: strategy: matrix: sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] + build_type: [Debug] steps: - name: Clone diff --git a/CMakeLists.txt b/CMakeLists.txt index a717a508f..42caed486 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,11 +83,8 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) # override ggml options -set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD}) -set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS}) -set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) -set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) -set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) +set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) +set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) # change the default for these ggml options if (NOT DEFINED GGML_LLAMAFILE) @@ -117,16 +114,62 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) llama_option_depr(WARNING LLAMA_CANN GGML_CANN) +if (NOT MSVC) + if (LLAMA_SANITIZE_THREAD) + message(STATUS "Using -fsanitize=thread") + + 
add_compile_options(-fsanitize=thread) + link_libraries (-fsanitize=thread) + endif() + + if (LLAMA_SANITIZE_ADDRESS) + message(STATUS "Using -fsanitize=address") + + add_compile_options(-fsanitize=address -fno-omit-frame-pointer) + link_libraries (-fsanitize=address) + endif() + + if (LLAMA_SANITIZE_UNDEFINED) + message(STATUS "Using -fsanitize=undefined") + + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() +endif() + # -# build the library +# 3rd-party # if (NOT TARGET ggml) add_subdirectory(ggml) # ... otherwise assume ggml is added by a parent CMakeLists.txt endif() + +# +# build the library +# + add_subdirectory(src) +# +# utils, programs, examples and tests +# + +if (LLAMA_BUILD_COMMON) + add_subdirectory(common) +endif() + +if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) + include(CTest) + add_subdirectory(tests) +endif() + +if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES) + add_subdirectory(examples) + add_subdirectory(pocs) +endif() + # # install # @@ -200,21 +243,3 @@ configure_file(cmake/llama.pc.in install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" DESTINATION lib/pkgconfig) - -# -# utils, programs, examples and tests -# - -if (LLAMA_BUILD_COMMON) - add_subdirectory(common) -endif() - -if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) - include(CTest) - add_subdirectory(tests) -endif() - -if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES) - add_subdirectory(examples) - add_subdirectory(pocs) -endif() diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 655ed600a..ab13669c5 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -648,6 +648,10 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par ok = ok && data != nullptr; + if (ok) { + ggml_set_name(data, "GGUF tensor data binary blob"); + } + // read the binary blob with the tensor data ok = ok && gr.read(data->data, ctx->size); diff --git a/src/unicode.cpp b/src/unicode.cpp 
index 7aca6544b..89180da41 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -7,18 +7,17 @@ #include #include +#include #include #include +#include #include #include #include #include #include -#include #include #include -#include -#include size_t unicode_len_utf8(char src) { const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2b5e5fd4a..3fa43c295 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,3 +1,5 @@ +llama_add_compile_flags() + function(llama_test target) include(CMakeParseArguments) set(options) diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 611957ac0..6ed696328 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -48,7 +48,7 @@ enum handcrafted_file_type { HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data, }; -std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) { +static std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) { switch (hft) { case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC"; case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1"; @@ -99,7 +99,7 @@ static bool expect_context_not_null(const enum handcrafted_file_type hft) { typedef std::pair> tensor_config_t; -std::vector get_tensor_configs(std::mt19937 & rng) { +static std::vector get_tensor_configs(std::mt19937 & rng) { std::vector tensor_configs; tensor_configs.reserve(100); @@ -122,7 +122,7 @@ std::vector get_tensor_configs(std::mt19937 & rng) { return tensor_configs; } -std::vector> get_kv_types(std::mt19937 rng) { +static std::vector> get_kv_types(std::mt19937 rng) { std::vector> kv_types; kv_types.reserve(100); @@ -626,8 +626,6 @@ static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const u bool ok = true; - const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT; - for (int i = 0; i < int(tensor_configs.size()); ++i) { const ggml_type type = tensor_configs[i].first; 
const std::array shape = tensor_configs[i].second; @@ -866,13 +864,13 @@ static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t case GGUF_TYPE_COUNT: default: { GGML_ABORT("fatal error"); - } break; + } } } break; case GGUF_TYPE_COUNT: default: { GGML_ABORT("fatal error"); - } break; + } } } @@ -938,7 +936,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other } if (type == GGUF_TYPE_ARRAY) { - const int arr_n = gguf_get_arr_n(ctx, id); + const size_t arr_n = gguf_get_arr_n(ctx, id); if (arr_n != gguf_get_arr_n(other, idx_other)) { ok = false; continue; @@ -953,7 +951,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other if (type_arr == GGUF_TYPE_BOOL) { const int8_t * data = reinterpret_cast(gguf_get_arr_data(ctx, id)); const int8_t * data_other = reinterpret_cast(gguf_get_arr_data(other, idx_other)); - for (int arr_i = 0; arr_i < arr_n; ++arr_i) { + for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) { if (bool(data[arr_i]) != bool(data_other[arr_i])) { ok = false; } @@ -962,7 +960,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other } if (type_arr == GGUF_TYPE_STRING) { - for (int arr_i = 0; arr_i < arr_n; ++arr_i) { + for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) { const std::string str = gguf_get_arr_str(ctx, id, arr_i); const std::string str_other = gguf_get_arr_str(other, idx_other, arr_i); if (str != str_other) { @@ -1033,6 +1031,12 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml struct ggml_tensor * t_orig = ggml_get_first_tensor(orig); struct ggml_tensor * t_read = ggml_get_first_tensor(read); + + if (std::string(t_read->name) != "GGUF tensor data binary blob") { + return false; + } + t_read = ggml_get_next_tensor(read, t_read); + while (t_orig) { if (!t_read) { ok = false; @@ -1051,13 +1055,13 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml } t_orig = 
ggml_get_next_tensor(orig, t_orig); - t_read = ggml_get_next_tensor(orig, t_read); + t_read = ggml_get_next_tensor(read, t_read); } if (t_read) { ok = false; } - return true; + return ok; } static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) { diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index c0dcb4848..61bd67850 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -144,7 +144,6 @@ static void test_penalties( sampler_tester tester(probs, probs_expected); - const size_t n_vocab = probs.size(); auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence); for (size_t i = 0; i < last_tokens.size(); i++) { From a1649cc13f89946322358f92ea268ae1b7b5096c Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Sat, 18 Jan 2025 14:42:31 +0000 Subject: [PATCH 25/30] Adding linenoise.cpp to llama-run (#11252) This is a fork of linenoise that is C++17 compatible. I intend on adding it to llama-run so we can do things like traverse prompt history via the up and down arrows: https://github.com/ericcurtin/linenoise.cpp Signed-off-by: Eric Curtin --- .github/workflows/build.yml | 4 + examples/run/CMakeLists.txt | 2 +- examples/run/linenoise.cpp/LICENSE | 26 + examples/run/linenoise.cpp/linenoise.cpp | 1351 ++++++++++++++++++++++ examples/run/linenoise.cpp/linenoise.h | 114 ++ examples/run/run.cpp | 38 +- 6 files changed, 1524 insertions(+), 11 deletions(-) create mode 100644 examples/run/linenoise.cpp/LICENSE create mode 100644 examples/run/linenoise.cpp/linenoise.cpp create mode 100644 examples/run/linenoise.cpp/linenoise.h diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9e0c4a675..fe3b2cdfa 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -87,6 +87,7 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: 
| cp LICENSE ./build/bin/ + cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - name: Upload artifacts @@ -149,6 +150,7 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ + cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - name: Upload artifacts @@ -217,6 +219,7 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | cp LICENSE ./build/bin/ + cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* - name: Upload artifacts @@ -796,6 +799,7 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt + Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* - name: Upload artifacts diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 0686d6305..cd6b0520e 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-run) -add_executable(${TARGET} run.cpp) +add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/run/linenoise.cpp/LICENSE b/examples/run/linenoise.cpp/LICENSE new file mode 100644 index 000000000..b006b3b24 --- /dev/null +++ 
b/examples/run/linenoise.cpp/LICENSE @@ -0,0 +1,26 @@ +Copyright (c) 2010-2014, Salvatore Sanfilippo +Copyright (c) 2010-2013, Pieter Noordhuis +Copyright (c) 2025, Eric Curtin + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/examples/run/linenoise.cpp/linenoise.cpp b/examples/run/linenoise.cpp/linenoise.cpp new file mode 100644 index 000000000..050c23012 --- /dev/null +++ b/examples/run/linenoise.cpp/linenoise.cpp @@ -0,0 +1,1351 @@ +#ifndef _WIN32 +/* + * You can find the latest source code at: + * + * http://github.com/ericcurtin/linenoise.cpp + * + * Does a number of crazy assumptions that happen to be true in 99.9999% of + * the 2010 UNIX computers around. 
+ * + * ------------------------------------------------------------------------ + * + * Copyright (c) 2010-2023, Salvatore Sanfilippo + * Copyright (c) 2010-2013, Pieter Noordhuis + * Copyright (c) 2025, Eric Curtin + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ------------------------------------------------------------------------ + * + * References: + * - http://invisible-island.net/xterm/ctlseqs/ctlseqs.html + * - http://www.3waylabs.com/nw/WWW/products/wizcon/vt220.html + * + * Todo list: + * - Filter bogus Ctrl+ combinations. + * - Win32 support + * + * Bloat: + * - History search like Ctrl+r in readline? 
+ * + * List of escape sequences used by this program, we do everything just + * with three sequences. In order to be so cheap we may have some + * flickering effect with some slow terminal, but the lesser sequences + * the more compatible. + * + * EL (Erase Line) + * Sequence: ESC [ n K + * Effect: if n is 0 or missing, clear from cursor to end of line + * Effect: if n is 1, clear from beginning of line to cursor + * Effect: if n is 2, clear entire line + * + * CUF (CUrsor Forward) + * Sequence: ESC [ n C + * Effect: moves cursor forward n chars + * + * CUB (CUrsor Backward) + * Sequence: ESC [ n D + * Effect: moves cursor backward n chars + * + * The following is used to get the terminal width if getting + * the width with the TIOCGWINSZ ioctl fails + * + * DSR (Device Status Report) + * Sequence: ESC [ 6 n + * Effect: reports the current cusor position as ESC [ n ; m R + * where n is the row and m is the column + * + * When multi line mode is enabled, we also use an additional escape + * sequence. However multi line editing is disabled by default. + * + * CUU (Cursor Up) + * Sequence: ESC [ n A + * Effect: moves cursor up of n chars. + * + * CUD (Cursor Down) + * Sequence: ESC [ n B + * Effect: moves cursor down of n chars. + * + * When linenoiseClearScreen() is called, two additional escape sequences + * are used in order to clear the screen and position the cursor at home + * position. 
+ * + * CUP (Cursor position) + * Sequence: ESC [ H + * Effect: moves the cursor to upper left corner + * + * ED (Erase display) + * Sequence: ESC [ 2 J + * Effect: clear the whole screen + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "linenoise.h" + +#define LINENOISE_DEFAULT_HISTORY_MAX_LEN 100 +#define LINENOISE_MAX_LINE 4096 +static std::vector unsupported_term = {"dumb","cons25","emacs",nullptr}; +static linenoiseCompletionCallback *completionCallback = NULL; +static linenoiseHintsCallback *hintsCallback = NULL; +static linenoiseFreeHintsCallback *freeHintsCallback = NULL; +static char *linenoiseNoTTY(void); +static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseCompletions *lc, int flags); +static void refreshLineWithFlags(struct linenoiseState *l, int flags); + +static struct termios orig_termios; /* In order to restore at exit.*/ +static int maskmode = 0; /* Show "***" instead of input. For passwords. */ +static int rawmode = 0; /* For atexit() function to check if restore is needed*/ +static int mlmode = 0; /* Multi line mode. Default is single line. */ +static int atexit_registered = 0; /* Register atexit just 1 time. 
*/ +static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN; +static int history_len = 0; +static char **history = NULL; + +enum KEY_ACTION{ + KEY_NULL = 0, /* NULL */ + CTRL_A = 1, /* Ctrl+a */ + CTRL_B = 2, /* Ctrl-b */ + CTRL_C = 3, /* Ctrl-c */ + CTRL_D = 4, /* Ctrl-d */ + CTRL_E = 5, /* Ctrl-e */ + CTRL_F = 6, /* Ctrl-f */ + CTRL_H = 8, /* Ctrl-h */ + TAB = 9, /* Tab */ + CTRL_K = 11, /* Ctrl+k */ + CTRL_L = 12, /* Ctrl+l */ + ENTER = 13, /* Enter */ + CTRL_N = 14, /* Ctrl-n */ + CTRL_P = 16, /* Ctrl-p */ + CTRL_T = 20, /* Ctrl-t */ + CTRL_U = 21, /* Ctrl+u */ + CTRL_W = 23, /* Ctrl+w */ + ESC = 27, /* Escape */ + BACKSPACE = 127 /* Backspace */ +}; + +static void linenoiseAtExit(void); +int linenoiseHistoryAdd(const char *line); +#define REFRESH_CLEAN (1<<0) // Clean the old prompt from the screen +#define REFRESH_WRITE (1<<1) // Rewrite the prompt on the screen. +#define REFRESH_ALL (REFRESH_CLEAN|REFRESH_WRITE) // Do both. +static void refreshLine(struct linenoiseState *l); + +__attribute__((format(printf, 1, 2))) +/* Debugging function. */ +#if 0 +static void lndebug(const char *fmt, ...) { + static FILE *lndebug_fp = NULL; + if (lndebug_fp == NULL) { + lndebug_fp = fopen("/tmp/lndebug.txt", "a"); + } + + if (lndebug_fp != NULL) { + va_list args; + va_start(args, fmt); + vfprintf(lndebug_fp, fmt, args); + va_end(args); + fflush(lndebug_fp); + } +} +#else +static void lndebug(const char *, ...) { +} +#endif + +/* ======================= Low level terminal handling ====================== */ + +/* Enable "mask mode". When it is enabled, instead of the input that + * the user is typing, the terminal will just display a corresponding + * number of asterisks, like "****". This is useful for passwords and other + * secrets that should not be displayed. */ +void linenoiseMaskModeEnable(void) { + maskmode = 1; +} + +/* Disable mask mode. */ +void linenoiseMaskModeDisable(void) { + maskmode = 0; +} + +/* Set if to use or not the multi line mode. 
*/ +void linenoiseSetMultiLine(int ml) { + mlmode = ml; +} + +/* Return true if the terminal name is in the list of terminals we know are + * not able to understand basic escape sequences. */ +static int isUnsupportedTerm(void) { + char *term = getenv("TERM"); + if (term == NULL) return 0; + for (int j = 0; unsupported_term[j]; ++j) + if (!strcasecmp(term, unsupported_term[j])) return 1; + return 0; +} + +/* Raw mode: 1960 magic shit. */ +static int enableRawMode(int fd) { + struct termios raw; + + if (!isatty(STDIN_FILENO)) goto fatal; + if (!atexit_registered) { + atexit(linenoiseAtExit); + atexit_registered = 1; + } + if (tcgetattr(fd,&orig_termios) == -1) goto fatal; + + raw = orig_termios; /* modify the original mode */ + /* input modes: no break, no CR to NL, no parity check, no strip char, + * no start/stop output control. */ + raw.c_iflag &= ~(BRKINT | ICRNL | INPCK | ISTRIP | IXON); + /* output modes - disable post processing */ + raw.c_oflag &= ~(OPOST); + /* control modes - set 8 bit chars */ + raw.c_cflag |= (CS8); + /* local modes - choing off, canonical off, no extended functions, + * no signal chars (^Z,^C) */ + raw.c_lflag &= ~(ECHO | ICANON | IEXTEN | ISIG); + /* control chars - set return condition: min number of bytes and timer. + * We want read to return every single byte, without timeout. */ + raw.c_cc[VMIN] = 1; raw.c_cc[VTIME] = 0; /* 1 byte, no timer */ + + /* put terminal in raw mode after flushing */ + if (tcsetattr(fd,TCSAFLUSH,&raw) < 0) goto fatal; + rawmode = 1; + return 0; + +fatal: + errno = ENOTTY; + return -1; +} + +static void disableRawMode(int fd) { + /* Don't even check the return value as it's too late. */ + if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1) + rawmode = 0; +} + +/* Use the ESC [6n escape sequence to query the horizontal cursor position + * and return it. On error -1 is returned, on success the position of the + * cursor. 
*/ +static int getCursorPosition(int ifd, int ofd) { + char buf[32]; + int cols, rows; + unsigned int i = 0; + + /* Report cursor location */ + if (write(ofd, "\x1b[6n", 4) != 4) return -1; + + /* Read the response: ESC [ rows ; cols R */ + while (i < sizeof(buf)-1) { + if (read(ifd,buf+i,1) != 1) break; + if (buf[i] == 'R') break; + i++; + } + buf[i] = '\0'; + + /* Parse it. */ + if (buf[0] != ESC || buf[1] != '[') return -1; + if (sscanf(buf+2,"%d;%d",&rows,&cols) != 2) return -1; + return cols; +} + +/* Try to get the number of columns in the current terminal, or assume 80 + * if it fails. */ +static int getColumns(int ifd, int ofd) { + struct winsize ws; + + if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) { + /* ioctl() failed. Try to query the terminal itself. */ + int start, cols; + + /* Get the initial position so we can restore it later. */ + start = getCursorPosition(ifd,ofd); + if (start == -1) goto failed; + + /* Go to right margin and get position. */ + if (write(ofd,"\x1b[999C",6) != 6) goto failed; + cols = getCursorPosition(ifd,ofd); + if (cols == -1) goto failed; + + /* Restore position. */ + if (cols > start) { + char seq[32]; + snprintf(seq,32,"\x1b[%dD",cols-start); + if (write(ofd,seq,strlen(seq)) == -1) { + /* Can't recover... */ + } + } + return cols; + } else { + return ws.ws_col; + } + +failed: + return 80; +} + +/* Clear the screen. Used to handle ctrl+l */ +void linenoiseClearScreen(void) { + if (write(STDOUT_FILENO,"\x1b[H\x1b[2J",7) <= 0) { + /* nothing to do, just to avoid warning. */ + } +} + +/* Beep, used for completion when there is nothing to complete or when all + * the choices were already shown. */ +static void linenoiseBeep(void) { + fprintf(stderr, "\x7"); + fflush(stderr); +} + +/* ============================== Completion ================================ */ + +/* Free a list of completion option populated by linenoiseAddCompletion(). 
*/ +static void freeCompletions(linenoiseCompletions *lc) { + size_t i; + for (i = 0; i < lc->len; i++) + free(lc->cvec[i]); + if (lc->cvec != NULL) + free(lc->cvec); +} + +/* Called by completeLine() and linenoiseShow() to render the current + * edited line with the proposed completion. If the current completion table + * is already available, it is passed as second argument, otherwise the + * function will use the callback to obtain it. + * + * Flags are the same as refreshLine*(), that is REFRESH_* macros. */ +static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseCompletions *lc, int flags) { + /* Obtain the table of completions if the caller didn't provide one. */ + linenoiseCompletions ctable = { 0, NULL }; + if (lc == NULL) { + completionCallback(ls->buf,&ctable); + lc = &ctable; + } + + /* Show the edited line with completion if possible, or just refresh. */ + if (ls->completion_idx < lc->len) { + struct linenoiseState saved = *ls; + ls->len = ls->pos = strlen(lc->cvec[ls->completion_idx]); + ls->buf = lc->cvec[ls->completion_idx]; + refreshLineWithFlags(ls,flags); + ls->len = saved.len; + ls->pos = saved.pos; + ls->buf = saved.buf; + } else { + refreshLineWithFlags(ls,flags); + } + + /* Free the completions table if needed. */ + if (lc != &ctable) freeCompletions(&ctable); +} + +/* This is an helper function for linenoiseEdit*() and is called when the + * user types the key in order to complete the string currently in the + * input. + * + * The state of the editing is encapsulated into the pointed linenoiseState + * structure as described in the structure definition. + * + * If the function returns non-zero, the caller should handle the + * returned value as a byte read from the standard input, and process + * it as usually: this basically means that the function may return a byte + * read from the termianl but not processed. 
Otherwise, if zero is returned, + * the input was consumed by the completeLine() function to navigate the + * possible completions, and the caller should read for the next characters + * from stdin. */ +static int completeLine(struct linenoiseState *ls, int keypressed) { + linenoiseCompletions lc = { 0, NULL }; + int nwritten; + char c = keypressed; + + completionCallback(ls->buf,&lc); + if (lc.len == 0) { + linenoiseBeep(); + ls->in_completion = 0; + } else { + switch(c) { + case 9: /* tab */ + if (ls->in_completion == 0) { + ls->in_completion = 1; + ls->completion_idx = 0; + } else { + ls->completion_idx = (ls->completion_idx+1) % (lc.len+1); + if (ls->completion_idx == lc.len) linenoiseBeep(); + } + c = 0; + break; + case 27: /* escape */ + /* Re-show original buffer */ + if (ls->completion_idx < lc.len) refreshLine(ls); + ls->in_completion = 0; + c = 0; + break; + default: + /* Update buffer and return */ + if (ls->completion_idx < lc.len) { + nwritten = snprintf(ls->buf,ls->buflen,"%s", + lc.cvec[ls->completion_idx]); + ls->len = ls->pos = nwritten; + } + ls->in_completion = 0; + break; + } + + /* Show completion or original buffer */ + if (ls->in_completion && ls->completion_idx < lc.len) { + refreshLineWithCompletion(ls,&lc,REFRESH_ALL); + } else { + refreshLine(ls); + } + } + + freeCompletions(&lc); + return c; /* Return last read character */ +} + +/* Register a callback function to be called for tab-completion. */ +void linenoiseSetCompletionCallback(linenoiseCompletionCallback *fn) { + completionCallback = fn; +} + +/* Register a hits function to be called to show hits to the user at the + * right of the prompt. */ +void linenoiseSetHintsCallback(linenoiseHintsCallback *fn) { + hintsCallback = fn; +} + +/* Register a function to free the hints returned by the hints callback + * registered with linenoiseSetHintsCallback(). 
*/ +void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *fn) { + freeHintsCallback = fn; +} + +/* This function is used by the callback function registered by the user + * in order to add completion options given the input string when the + * user typed . See the example.c source code for a very easy to + * understand example. */ +void linenoiseAddCompletion(linenoiseCompletions *lc, const char *str) { + size_t len = strlen(str); + char *copy, **cvec; + + copy = (char*) malloc(len + 1); + if (copy == NULL) return; + memcpy(copy,str,len+1); + cvec = (char**) realloc(lc->cvec,sizeof(char*)*(lc->len+1)); + if (cvec == NULL) { + free(copy); + return; + } + lc->cvec = cvec; + lc->cvec[lc->len++] = copy; +} + +/* =========================== Line editing ================================= */ + +/* We define a very simple "append buffer" structure, that is an heap + * allocated string where we can append to. This is useful in order to + * write all the escape sequences in a buffer and flush them to the standard + * output in a single call, to avoid flickering effects. */ +struct abuf { + char *b; + int len; +}; + +static void abInit(struct abuf *ab) { + ab->b = NULL; + ab->len = 0; +} + +static void abAppend(struct abuf *ab, const char *s, int len) { + char *new_ptr = (char*) realloc(ab->b,ab->len+len); + + if (new_ptr == NULL) return; + memcpy(new_ptr+ab->len,s,len); + ab->b = new_ptr; + ab->len += len; +} + +static void abFree(struct abuf *ab) { + free(ab->b); +} + +/* Helper of refreshSingleLine() and refreshMultiLine() to show hints + * to the right of the prompt. 
*/ +static void refreshShowHints(struct abuf * ab, struct linenoiseState * l, int plen) { + char seq[64]; + if (hintsCallback && plen+l->len < l->cols) { + int color = -1, bold = 0; + const char *hint = hintsCallback(l->buf,&color,&bold); + if (hint) { + int hintlen = strlen(hint); + int hintmaxlen = l->cols-(plen+l->len); + if (hintlen > hintmaxlen) hintlen = hintmaxlen; + if (bold == 1 && color == -1) color = 37; + if (color != -1 || bold != 0) + snprintf(seq,64,"\033[%d;%d;49m",bold,color); + else + seq[0] = '\0'; + abAppend(ab,seq,strlen(seq)); + abAppend(ab,hint,hintlen); + if (color != -1 || bold != 0) + abAppend(ab,"\033[0m",4); + /* Call the function to free the hint returned. */ + if (freeHintsCallback) freeHintsCallback(hint); + } + } +} + +/* Single line low level line refresh. + * + * Rewrite the currently edited line accordingly to the buffer content, + * cursor position, and number of columns of the terminal. + * + * Flags is REFRESH_* macros. The function can just remove the old + * prompt, just write it, or both. */ +static void refreshSingleLine(struct linenoiseState *l, int flags) { + char seq[64]; + size_t plen = strlen(l->prompt); + int fd = l->ofd; + char *buf = l->buf; + size_t len = l->len; + size_t pos = l->pos; + struct abuf ab; + + while((plen+pos) >= l->cols) { + buf++; + len--; + pos--; + } + while (plen+len > l->cols) { + len--; + } + + abInit(&ab); + /* Cursor to left edge */ + snprintf(seq,sizeof(seq),"\r"); + abAppend(&ab,seq,strlen(seq)); + + if (flags & REFRESH_WRITE) { + /* Write the prompt and the current buffer content */ + abAppend(&ab,l->prompt,strlen(l->prompt)); + if (maskmode == 1) { + while (len--) abAppend(&ab,"*",1); + } else { + abAppend(&ab,buf,len); + } + /* Show hits if any. */ + refreshShowHints(&ab,l,plen); + } + + /* Erase to right */ + snprintf(seq,sizeof(seq),"\x1b[0K"); + abAppend(&ab,seq,strlen(seq)); + + if (flags & REFRESH_WRITE) { + /* Move cursor to original position. 
*/ + snprintf(seq,sizeof(seq),"\r\x1b[%dC", (int)(pos+plen)); + abAppend(&ab,seq,strlen(seq)); + } + + if (write(fd,ab.b,ab.len) == -1) {} /* Can't recover from write error. */ + abFree(&ab); +} + +/* Multi line low level line refresh. + * + * Rewrite the currently edited line accordingly to the buffer content, + * cursor position, and number of columns of the terminal. + * + * Flags is REFRESH_* macros. The function can just remove the old + * prompt, just write it, or both. */ +static void refreshMultiLine(struct linenoiseState *l, int flags) { + char seq[64]; + int plen = strlen(l->prompt); + int rows = (plen+l->len+l->cols-1)/l->cols; /* rows used by current buf. */ + int rpos = (plen+l->oldpos+l->cols)/l->cols; /* cursor relative row. */ + int rpos2; /* rpos after refresh. */ + int col; /* colum position, zero-based. */ + int old_rows = l->oldrows; + int fd = l->ofd, j; + struct abuf ab; + + l->oldrows = rows; + + /* First step: clear all the lines used before. To do so start by + * going to the last row. */ + abInit(&ab); + + if (flags & REFRESH_CLEAN) { + if (old_rows-rpos > 0) { + lndebug("go down %d", old_rows-rpos); + snprintf(seq,64,"\x1b[%dB", old_rows-rpos); + abAppend(&ab,seq,strlen(seq)); + } + + /* Now for every row clear it, go up. */ + for (j = 0; j < old_rows-1; j++) { + lndebug("clear+up"); + snprintf(seq,64,"\r\x1b[0K\x1b[1A"); + abAppend(&ab,seq,strlen(seq)); + } + } + + if (flags & REFRESH_ALL) { + /* Clean the top line. */ + lndebug("clear"); + snprintf(seq,64,"\r\x1b[0K"); + abAppend(&ab,seq,strlen(seq)); + } + + if (flags & REFRESH_WRITE) { + /* Write the prompt and the current buffer content */ + abAppend(&ab,l->prompt,strlen(l->prompt)); + if (maskmode == 1) { + unsigned int i; + for (i = 0; i < l->len; i++) abAppend(&ab,"*",1); + } else { + abAppend(&ab,l->buf,l->len); + } + + /* Show hits if any. 
*/ + refreshShowHints(&ab,l,plen); + + /* If we are at the very end of the screen with our prompt, we need to + * emit a newline and move the prompt to the first column. */ + if (l->pos && + l->pos == l->len && + (l->pos+plen) % l->cols == 0) + { + lndebug(""); + abAppend(&ab,"\n",1); + snprintf(seq,64,"\r"); + abAppend(&ab,seq,strlen(seq)); + rows++; + if (rows > (int)l->oldrows) l->oldrows = rows; + } + + /* Move cursor to right position. */ + rpos2 = (plen+l->pos+l->cols)/l->cols; /* Current cursor relative row */ + lndebug("rpos2 %d", rpos2); + + /* Go up till we reach the expected positon. */ + if (rows-rpos2 > 0) { + lndebug("go-up %d", rows-rpos2); + snprintf(seq,64,"\x1b[%dA", rows-rpos2); + abAppend(&ab,seq,strlen(seq)); + } + + /* Set column. */ + col = (plen+(int)l->pos) % (int)l->cols; + lndebug("set col %d", 1+col); + if (col) + snprintf(seq,64,"\r\x1b[%dC", col); + else + snprintf(seq,64,"\r"); + abAppend(&ab,seq,strlen(seq)); + } + + lndebug("\n"); + l->oldpos = l->pos; + + if (write(fd,ab.b,ab.len) == -1) {} /* Can't recover from write error. */ + abFree(&ab); +} + +/* Calls the two low level functions refreshSingleLine() or + * refreshMultiLine() according to the selected mode. */ +static void refreshLineWithFlags(struct linenoiseState *l, int flags) { + if (mlmode) + refreshMultiLine(l,flags); + else + refreshSingleLine(l,flags); +} + +/* Utility function to avoid specifying REFRESH_ALL all the times. */ +static void refreshLine(struct linenoiseState *l) { + refreshLineWithFlags(l,REFRESH_ALL); +} + +/* Hide the current line, when using the multiplexing API. */ +void linenoiseHide(struct linenoiseState *l) { + if (mlmode) + refreshMultiLine(l,REFRESH_CLEAN); + else + refreshSingleLine(l,REFRESH_CLEAN); +} + +/* Show the current line, when using the multiplexing API. 
*/ +void linenoiseShow(struct linenoiseState *l) { + if (l->in_completion) { + refreshLineWithCompletion(l,NULL,REFRESH_WRITE); + } else { + refreshLineWithFlags(l,REFRESH_WRITE); + } +} + +/* Insert the character 'c' at cursor current position. + * + * On error writing to the terminal -1 is returned, otherwise 0. */ +static int linenoiseEditInsert(struct linenoiseState * l, char c) { + if (l->len < l->buflen) { + if (l->len == l->pos) { + l->buf[l->pos] = c; + l->pos++; + l->len++; + l->buf[l->len] = '\0'; + if ((!mlmode && l->plen+l->len < l->cols && !hintsCallback)) { + /* Avoid a full update of the line in the + * trivial case. */ + char d = (maskmode==1) ? '*' : c; + if (write(l->ofd,&d,1) == -1) return -1; + } else { + refreshLine(l); + } + } else { + memmove(l->buf+l->pos+1,l->buf+l->pos,l->len-l->pos); + l->buf[l->pos] = c; + l->len++; + l->pos++; + l->buf[l->len] = '\0'; + refreshLine(l); + } + } + return 0; +} + +/* Move cursor on the left. */ +static void linenoiseEditMoveLeft(struct linenoiseState * l) { + if (l->pos > 0) { + l->pos--; + refreshLine(l); + } +} + +/* Move cursor on the right. */ +static void linenoiseEditMoveRight(struct linenoiseState * l) { + if (l->pos != l->len) { + l->pos++; + refreshLine(l); + } +} + +/* Move cursor to the start of the line. */ +static void linenoiseEditMoveHome(struct linenoiseState * l) { + if (l->pos != 0) { + l->pos = 0; + refreshLine(l); + } +} + +/* Move cursor to the end of the line. */ +static void linenoiseEditMoveEnd(struct linenoiseState * l) { + if (l->pos != l->len) { + l->pos = l->len; + refreshLine(l); + } +} + +/* Substitute the currently edited line with the next or previous history + * entry as specified by 'dir'. */ +#define LINENOISE_HISTORY_NEXT 0 +#define LINENOISE_HISTORY_PREV 1 + +static void linenoiseEditHistoryNext(struct linenoiseState * l, int dir) { + if (history_len > 1) { + /* Update the current history entry before to + * overwrite it with the next one. 
*/ + free(history[history_len - 1 - l->history_index]); + history[history_len - 1 - l->history_index] = strdup(l->buf); + /* Show the new entry */ + l->history_index += (dir == LINENOISE_HISTORY_PREV) ? 1 : -1; + if (l->history_index < 0) { + l->history_index = 0; + return; + } else if (l->history_index >= history_len) { + l->history_index = history_len-1; + return; + } + strncpy(l->buf,history[history_len - 1 - l->history_index],l->buflen); + l->buf[l->buflen-1] = '\0'; + l->len = l->pos = strlen(l->buf); + refreshLine(l); + } +} + +/* Delete the character at the right of the cursor without altering the cursor + * position. Basically this is what happens with the "Delete" keyboard key. */ +static void linenoiseEditDelete(struct linenoiseState * l) { + if (l->len > 0 && l->pos < l->len) { + memmove(l->buf+l->pos,l->buf+l->pos+1,l->len-l->pos-1); + l->len--; + l->buf[l->len] = '\0'; + refreshLine(l); + } +} + +/* Backspace implementation. */ +static void linenoiseEditBackspace(struct linenoiseState * l) { + if (l->pos > 0 && l->len > 0) { + memmove(l->buf+l->pos-1,l->buf+l->pos,l->len-l->pos); + l->pos--; + l->len--; + l->buf[l->len] = '\0'; + refreshLine(l); + } +} + +/* Delete the previosu word, maintaining the cursor at the start of the + * current word. */ +static void linenoiseEditDeletePrevWord(struct linenoiseState * l) { + size_t old_pos = l->pos; + size_t diff; + + while (l->pos > 0 && l->buf[l->pos-1] == ' ') + l->pos--; + while (l->pos > 0 && l->buf[l->pos-1] != ' ') + l->pos--; + diff = old_pos - l->pos; + memmove(l->buf+l->pos,l->buf+old_pos,l->len-old_pos+1); + l->len -= diff; + refreshLine(l); +} + +/* This function is part of the multiplexed API of Linenoise, that is used + * in order to implement the blocking variant of the API but can also be + * called by the user directly in an event driven program. It will: + * + * 1. Initialize the linenoise state passed by the user. + * 2. Put the terminal in RAW mode. + * 3. Show the prompt. + * 4. 
Return control to the user, that will have to call linenoiseEditFeed() + * each time there is some data arriving in the standard input. + * + * The user can also call linenoiseEditHide() and linenoiseEditShow() if it + * is required to show some input arriving asyncronously, without mixing + * it with the currently edited line. + * + * When linenoiseEditFeed() returns non-NULL, the user finished with the + * line editing session (pressed enter CTRL-D/C): in this case the caller + * needs to call linenoiseEditStop() to put back the terminal in normal + * mode. This will not destroy the buffer, as long as the linenoiseState + * is still valid in the context of the caller. + * + * The function returns 0 on success, or -1 if writing to standard output + * fails. If stdin_fd or stdout_fd are set to -1, the default is to use + * STDIN_FILENO and STDOUT_FILENO. + */ +int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt) { + /* Populate the linenoise state that we pass to functions implementing + * specific editing functionalities. */ + l->in_completion = 0; + l->ifd = stdin_fd != -1 ? stdin_fd : STDIN_FILENO; + l->ofd = stdout_fd != -1 ? stdout_fd : STDOUT_FILENO; + l->buf = buf; + l->buflen = buflen; + l->prompt = prompt; + l->plen = strlen(prompt); + l->oldpos = l->pos = 0; + l->len = 0; + + /* Enter raw mode. */ + if (enableRawMode(l->ifd) == -1) return -1; + + l->cols = getColumns(stdin_fd, stdout_fd); + l->oldrows = 0; + l->history_index = 0; + + /* Buffer starts empty. */ + l->buf[0] = '\0'; + l->buflen--; /* Make sure there is always space for the nulterm */ + + /* If stdin is not a tty, stop here with the initialization. We + * will actually just read a line from standard input in blocking + * mode later, in linenoiseEditFeed(). */ + if (!isatty(l->ifd)) return 0; + + /* The latest history entry is always our current buffer, that + * initially is just an empty string. 
*/ + linenoiseHistoryAdd(""); + + if (write(l->ofd,prompt,l->plen) == -1) return -1; + return 0; +} + +const char* linenoiseEditMore = "If you see this, you are misusing the API: when linenoiseEditFeed() is called, if it returns linenoiseEditMore the user is yet editing the line. See the README file for more information."; + +/* This function is part of the multiplexed API of linenoise, see the top + * comment on linenoiseEditStart() for more information. Call this function + * each time there is some data to read from the standard input file + * descriptor. In the case of blocking operations, this function can just be + * called in a loop, and block. + * + * The function returns linenoiseEditMore to signal that line editing is still + * in progress, that is, the user didn't yet pressed enter / CTRL-D. Otherwise + * the function returns the pointer to the heap-allocated buffer with the + * edited line, that the user should free with linenoiseFree(). + * + * On special conditions, NULL is returned and errno is populated: + * + * EAGAIN if the user pressed Ctrl-C + * ENOENT if the user pressed Ctrl-D + * + * Some other errno: I/O error. + */ +const char *linenoiseEditFeed(struct linenoiseState *l) { + /* Not a TTY, pass control to line reading without character + * count limits. */ + if (!isatty(l->ifd)) return linenoiseNoTTY(); + + char c; + int nread; + char seq[3]; + + nread = read(l->ifd,&c,1); + if (nread <= 0) return NULL; + + /* Only autocomplete when the callback is set. It returns < 0 when + * there was an error reading from fd. Otherwise it will return the + * character that should be handled next. 
*/ + if ((l->in_completion || c == 9) && completionCallback != NULL) { + c = completeLine(l,c); + /* Read next character when 0 */ + if (c == 0) return linenoiseEditMore; + } + + switch(c) { + case ENTER: /* enter */ + history_len--; + free(history[history_len]); + if (mlmode) linenoiseEditMoveEnd(l); + if (hintsCallback) { + /* Force a refresh without hints to leave the previous + * line as the user typed it after a newline. */ + linenoiseHintsCallback *hc = hintsCallback; + hintsCallback = NULL; + refreshLine(l); + hintsCallback = hc; + } + return strdup(l->buf); + case CTRL_C: /* ctrl-c */ + errno = EAGAIN; + return NULL; + case BACKSPACE: /* backspace */ + case 8: /* ctrl-h */ + linenoiseEditBackspace(l); + break; + case CTRL_D: /* ctrl-d, remove char at right of cursor, or if the + line is empty, act as end-of-file. */ + if (l->len > 0) { + linenoiseEditDelete(l); + } else { + history_len--; + free(history[history_len]); + errno = ENOENT; + return NULL; + } + break; + case CTRL_T: /* ctrl-t, swaps current character with previous. */ + if (l->pos > 0 && l->pos < l->len) { + int aux = l->buf[l->pos-1]; + l->buf[l->pos-1] = l->buf[l->pos]; + l->buf[l->pos] = aux; + if (l->pos != l->len-1) l->pos++; + refreshLine(l); + } + break; + case CTRL_B: /* ctrl-b */ + linenoiseEditMoveLeft(l); + break; + case CTRL_F: /* ctrl-f */ + linenoiseEditMoveRight(l); + break; + case CTRL_P: /* ctrl-p */ + linenoiseEditHistoryNext(l, LINENOISE_HISTORY_PREV); + break; + case CTRL_N: /* ctrl-n */ + linenoiseEditHistoryNext(l, LINENOISE_HISTORY_NEXT); + break; + case ESC: /* escape sequence */ + /* Read the next two bytes representing the escape sequence. + * Use two calls to handle slow terminals returning the two + * chars at different times. */ + if (read(l->ifd,seq,1) == -1) break; + if (read(l->ifd,seq+1,1) == -1) break; + + /* ESC [ sequences. */ + if (seq[0] == '[') { + if (seq[1] >= '0' && seq[1] <= '9') { + /* Extended escape, read additional byte. 
*/ + if (read(l->ifd,seq+2,1) == -1) break; + if (seq[2] == '~') { + switch(seq[1]) { + case '3': /* Delete key. */ + linenoiseEditDelete(l); + break; + } + } + } else { + switch(seq[1]) { + case 'A': /* Up */ + linenoiseEditHistoryNext(l, LINENOISE_HISTORY_PREV); + break; + case 'B': /* Down */ + linenoiseEditHistoryNext(l, LINENOISE_HISTORY_NEXT); + break; + case 'C': /* Right */ + linenoiseEditMoveRight(l); + break; + case 'D': /* Left */ + linenoiseEditMoveLeft(l); + break; + case 'H': /* Home */ + linenoiseEditMoveHome(l); + break; + case 'F': /* End*/ + linenoiseEditMoveEnd(l); + break; + } + } + } + + /* ESC O sequences. */ + else if (seq[0] == 'O') { + switch(seq[1]) { + case 'H': /* Home */ + linenoiseEditMoveHome(l); + break; + case 'F': /* End*/ + linenoiseEditMoveEnd(l); + break; + } + } + break; + default: + if (linenoiseEditInsert(l,c)) return NULL; + break; + case CTRL_U: /* Ctrl+u, delete the whole line. */ + l->buf[0] = '\0'; + l->pos = l->len = 0; + refreshLine(l); + break; + case CTRL_K: /* Ctrl+k, delete from current to end of line. */ + l->buf[l->pos] = '\0'; + l->len = l->pos; + refreshLine(l); + break; + case CTRL_A: /* Ctrl+a, go to the start of the line */ + linenoiseEditMoveHome(l); + break; + case CTRL_E: /* ctrl+e, go to the end of the line */ + linenoiseEditMoveEnd(l); + break; + case CTRL_L: /* ctrl+l, clear screen */ + linenoiseClearScreen(); + refreshLine(l); + break; + case CTRL_W: /* ctrl+w, delete previous word */ + linenoiseEditDeletePrevWord(l); + break; + } + return linenoiseEditMore; +} + +/* This is part of the multiplexed linenoise API. See linenoiseEditStart() + * for more information. This function is called when linenoiseEditFeed() + * returns something different than NULL. At this point the user input + * is in the buffer, and we can restore the terminal in normal mode. 
*/ +void linenoiseEditStop(struct linenoiseState *l) { + if (!isatty(l->ifd)) return; + disableRawMode(l->ifd); + printf("\n"); +} + +/* This just implements a blocking loop for the multiplexed API. + * In many applications that are not event-drivern, we can just call + * the blocking linenoise API, wait for the user to complete the editing + * and return the buffer. */ +static const char *linenoiseBlockingEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt) +{ + struct linenoiseState l; + + /* Editing without a buffer is invalid. */ + if (buflen == 0) { + errno = EINVAL; + return NULL; + } + + linenoiseEditStart(&l,stdin_fd,stdout_fd,buf,buflen,prompt); + const char *res; + while((res = linenoiseEditFeed(&l)) == linenoiseEditMore); + linenoiseEditStop(&l); + return res; +} + +/* This special mode is used by linenoise in order to print scan codes + * on screen for debugging / development purposes. It is implemented + * by the linenoise_example program using the --keycodes option. */ +void linenoisePrintKeyCodes(void) { + char quit[4]; + + printf("Linenoise key codes debugging mode.\n" + "Press keys to see scan codes. Type 'quit' at any time to exit.\n"); + if (enableRawMode(STDIN_FILENO) == -1) return; + memset(quit,' ',4); + while(1) { + char c; + int nread; + + nread = read(STDIN_FILENO,&c,1); + if (nread <= 0) continue; + memmove(quit,quit+1,sizeof(quit)-1); /* shift string to left. */ + quit[sizeof(quit)-1] = c; /* Insert current char on the right. */ + if (memcmp(quit,"quit",sizeof(quit)) == 0) break; + + printf("'%c' %02x (%d) (type quit to exit)\n", + isprint(c) ? c : '?', (int)c, (int)c); + printf("\r"); /* Go left edge manually, we are in raw mode. */ + fflush(stdout); + } + disableRawMode(STDIN_FILENO); +} + +/* This function is called when linenoise() is called with the standard + * input file descriptor not attached to a TTY. 
So for example when the + * program using linenoise is called in pipe or with a file redirected + * to its standard input. In this case, we want to be able to return the + * line regardless of its length (by default we are limited to 4k). */ +static char *linenoiseNoTTY(void) { + char *line = NULL; + size_t len = 0, maxlen = 0; + + while(1) { + if (len == maxlen) { + if (maxlen == 0) maxlen = 16; + maxlen *= 2; + char *oldval = line; + line = (char*) realloc(line,maxlen); + if (line == NULL) { + if (oldval) free(oldval); + return NULL; + } + } + int c = fgetc(stdin); + if (c == EOF || c == '\n') { + if (c == EOF && len == 0) { + free(line); + return NULL; + } else { + line[len] = '\0'; + return line; + } + } else { + line[len] = c; + len++; + } + } +} + +/* The high level function that is the main API of the linenoise library. + * This function checks if the terminal has basic capabilities, just checking + * for a blacklist of stupid terminals, and later either calls the line + * editing function or uses dummy fgets() so that you will be able to type + * something even in the most desperate of the conditions. */ +const char *linenoise(const char *prompt) { + char buf[LINENOISE_MAX_LINE]; + + if (!isatty(STDIN_FILENO)) { + /* Not a tty: read from file / pipe. In this mode we don't want any + * limit to the line size, so we call a function to handle that. 
*/ + return linenoiseNoTTY(); + } else if (isUnsupportedTerm()) { + size_t len; + + printf("%s",prompt); + fflush(stdout); + if (fgets(buf,LINENOISE_MAX_LINE,stdin) == NULL) return NULL; + len = strlen(buf); + while(len && (buf[len-1] == '\n' || buf[len-1] == '\r')) { + len--; + buf[len] = '\0'; + } + return strdup(buf); + } else { + const char *retval = linenoiseBlockingEdit(STDIN_FILENO,STDOUT_FILENO,buf,LINENOISE_MAX_LINE,prompt); + return retval; + } +} + +/* This is just a wrapper the user may want to call in order to make sure + * the linenoise returned buffer is freed with the same allocator it was + * created with. Useful when the main program is using an alternative + * allocator. */ +void linenoiseFree(void *ptr) { + if (ptr == linenoiseEditMore) return; // Protect from API misuse. + free(ptr); +} + +/* ================================ History ================================= */ + +/* Free the history, but does not reset it. Only used when we have to + * exit() to avoid memory leaks are reported by valgrind & co. */ +static void freeHistory(void) { + if (history) { + int j; + + for (j = 0; j < history_len; j++) + free(history[j]); + free(history); + } +} + +/* At exit we'll try to fix the terminal to the initial conditions. */ +static void linenoiseAtExit(void) { + disableRawMode(STDIN_FILENO); + freeHistory(); +} + +/* This is the API call to add a new entry in the linenoise history. + * It uses a fixed array of char pointers that are shifted (memmoved) + * when the history max length is reached in order to remove the older + * entry and make room for the new one, so it is not exactly suitable for huge + * histories, but will work well for a few hundred of entries. + * + * Using a circular buffer is smarter, but a bit more complex to handle. */ +int linenoiseHistoryAdd(const char *line) { + char *linecopy; + + if (history_max_len == 0) return 0; + + /* Initialization on first call. 
*/ + if (history == NULL) { + history = (char**) malloc(sizeof(char*)*history_max_len); + if (history == NULL) return 0; + memset(history,0,(sizeof(char*)*history_max_len)); + } + + /* Don't add duplicated lines. */ + if (history_len && !strcmp(history[history_len-1], line)) return 0; + + /* Add an heap allocated copy of the line in the history. + * If we reached the max length, remove the older line. */ + linecopy = strdup(line); + if (!linecopy) return 0; + if (history_len == history_max_len) { + free(history[0]); + memmove(history,history+1,sizeof(char*)*(history_max_len-1)); + history_len--; + } + history[history_len] = linecopy; + history_len++; + return 1; +} + +/* Set the maximum length for the history. This function can be called even + * if there is already some history, the function will make sure to retain + * just the latest 'len' elements if the new history length value is smaller + * than the amount of items already inside the history. */ +int linenoiseHistorySetMaxLen(int len) { + char **new_ptr; + + if (len < 1) return 0; + if (history) { + int tocopy = history_len; + + new_ptr = (char**) malloc(sizeof(char*)*len); + if (new_ptr == NULL) return 0; + + /* If we can't copy everything, free the elements we'll not use. */ + if (len < tocopy) { + int j; + + for (j = 0; j < tocopy-len; j++) free(history[j]); + tocopy = len; + } + memset(new_ptr,0,sizeof(char*)*len); + memcpy(new_ptr,history+(history_len-tocopy), sizeof(char*)*tocopy); + free(history); + history = new_ptr; + } + history_max_len = len; + if (history_len > history_max_len) + history_len = history_max_len; + return 1; +} + +/* Save the history in the specified file. On success 0 is returned + * otherwise -1 is returned. 
*/ +int linenoiseHistorySave(const char *filename) { + mode_t old_umask = umask(S_IXUSR|S_IRWXG|S_IRWXO); + FILE *fp; + int j; + + fp = fopen(filename,"w"); + umask(old_umask); + if (fp == NULL) return -1; + chmod(filename,S_IRUSR|S_IWUSR); + for (j = 0; j < history_len; j++) + fprintf(fp,"%s\n",history[j]); + fclose(fp); + return 0; +} + +/* Load the history from the specified file. If the file does not exist + * zero is returned and no operation is performed. + * + * If the file exists and the operation succeeded 0 is returned, otherwise + * on error -1 is returned. */ +int linenoiseHistoryLoad(const char *filename) { + FILE *fp = fopen(filename,"r"); + char buf[LINENOISE_MAX_LINE]; + + if (fp == NULL) return -1; + + while (fgets(buf,LINENOISE_MAX_LINE,fp) != NULL) { + char *p; + + p = strchr(buf,'\r'); + if (!p) p = strchr(buf,'\n'); + if (p) *p = '\0'; + linenoiseHistoryAdd(buf); + } + fclose(fp); + return 0; +} +#endif diff --git a/examples/run/linenoise.cpp/linenoise.h b/examples/run/linenoise.cpp/linenoise.h new file mode 100644 index 000000000..3e25f4de3 --- /dev/null +++ b/examples/run/linenoise.cpp/linenoise.h @@ -0,0 +1,114 @@ +/* linenoise.h -- VERSION 1.0 + * + * Guerrilla line editing library against the idea that a line editing lib + * needs to be 20,000 lines of C++ code. + * + * See linenoise.cpp for more information. + * + * ------------------------------------------------------------------------ + * + * Copyright (c) 2010-2023, Salvatore Sanfilippo + * Copyright (c) 2010-2013, Pieter Noordhuis + * Copyright (c) 2025, Eric Curtin + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __LINENOISE_H +#define __LINENOISE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include /* For size_t. */ + +extern const char *linenoiseEditMore; + +/* The linenoiseState structure represents the state during line editing. + * We pass this state to functions implementing specific editing + * functionalities. */ +struct linenoiseState { + int in_completion; /* The user pressed TAB and we are now in completion + * mode, so input is handled by completeLine(). */ + size_t completion_idx; /* Index of next completion to propose. */ + int ifd; /* Terminal stdin file descriptor. */ + int ofd; /* Terminal stdout file descriptor. */ + char *buf; /* Edited line buffer. */ + size_t buflen; /* Edited line buffer size. */ + const char *prompt; /* Prompt to display. */ + size_t plen; /* Prompt length. */ + size_t pos; /* Current cursor position. */ + size_t oldpos; /* Previous refresh cursor position. */ + size_t len; /* Current edited line length. 
*/ + size_t cols; /* Number of columns in terminal. */ + size_t oldrows; /* Rows used by last refrehsed line (multiline mode) */ + int history_index; /* The history index we are currently editing. */ +}; + +typedef struct linenoiseCompletions { + size_t len; + char **cvec; +} linenoiseCompletions; + +/* Non blocking API. */ +int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt); +const char *linenoiseEditFeed(struct linenoiseState *l); +void linenoiseEditStop(struct linenoiseState *l); +void linenoiseHide(struct linenoiseState *l); +void linenoiseShow(struct linenoiseState *l); + +/* Blocking API. */ +const char *linenoise(const char *prompt); +void linenoiseFree(void *ptr); + +/* Completion API. */ +typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *); +typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold); +typedef void(linenoiseFreeHintsCallback)(const char *); +void linenoiseSetCompletionCallback(linenoiseCompletionCallback *); +void linenoiseSetHintsCallback(linenoiseHintsCallback *); +void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *); +void linenoiseAddCompletion(linenoiseCompletions *, const char *); + +/* History API. */ +int linenoiseHistoryAdd(const char *line); +int linenoiseHistorySetMaxLen(int len); +int linenoiseHistorySave(const char *filename); +int linenoiseHistoryLoad(const char *filename); + +/* Other utilities. 
*/ +void linenoiseClearScreen(void); +void linenoiseSetMultiLine(int ml); +void linenoisePrintKeyCodes(void); +void linenoiseMaskModeEnable(void); +void linenoiseMaskModeDisable(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __LINENOISE_H */ diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 0ad8bb15b..dd9ea79e8 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -19,12 +19,14 @@ #include #include #include +#include #include #include #include #include "common.h" #include "json.hpp" +#include "linenoise.cpp/linenoise.h" #include "llama-cpp.h" #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) @@ -536,7 +538,7 @@ class LlamaData { llama_sampler_ptr sampler; llama_context_ptr context; std::vector messages; - std::vector msg_strs; + std::list msg_strs; std::vector fmtted; int init(Opt & opt) { @@ -807,24 +809,44 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str batch = llama_batch_get_one(&new_token_id, 1); } + printf("\033[0m"); return 0; } -static int read_user_input(std::string & user) { - std::getline(std::cin, user); +static int read_user_input(std::string & user_input) { + static const char * prompt_prefix = "> "; +#ifdef WIN32 + printf( + "\r%*s" + "\r\033[0m%s", + get_terminal_width(), " ", prompt_prefix); + + std::getline(std::cin, user_input); if (std::cin.eof()) { printf("\n"); return 1; } - - if (user == "/bye") { +#else + std::unique_ptr line(const_cast(linenoise(prompt_prefix)), free); + if (!line) { return 1; } - if (user.empty()) { + user_input = line.get(); +#endif + + if (user_input == "/bye") { + return 1; + } + + if (user_input.empty()) { return 2; } +#ifndef WIN32 + linenoiseHistoryAdd(line.get()); +#endif + return 0; // Should have data in happy path } @@ -865,10 +887,6 @@ static int handle_user_input(std::string & user_input, const std::string & user) return 0; // No need for interactive input } - printf( - "\r%*s" - "\r\033[32m> \033[0m", - 
get_terminal_width(), " "); return read_user_input(user_input); // Returns true if input ends the loop } From 99487b57d47e14dc342b7b89d238ca11c0345241 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Scipione?= Date: Sun, 19 Jan 2025 14:33:34 +0100 Subject: [PATCH 26/30] SYCL: Introducing memory host pool (#11251) * Implement host pool for matrix_info Creating a new memory pool on the host to store memory location for matrix_info needed to launch gemm_batch from oneMKL/oneMath. Removing complex support in gemm_batch since it is not used in llama.cpp * Remove unnecessary headers and cast * Reorder member variable to avoid warning on initialization * Formatting * Remove unused variable * Address PR review feedback - remove warning --------- Signed-off-by: nscipione --- ggml/src/ggml-sycl/common.hpp | 13 +++ ggml/src/ggml-sycl/dpct/helper.hpp | 135 +++++++++-------------------- ggml/src/ggml-sycl/ggml-sycl.cpp | 92 ++++++++++++++++++-- 3 files changed, 137 insertions(+), 103 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index e9500f3a1..abad847ca 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -333,8 +333,12 @@ struct ggml_backend_sycl_context { // pool std::unique_ptr pools[GGML_SYCL_MAX_DEVICES]; + std::unique_ptr host_pools[GGML_SYCL_MAX_DEVICES]; + static std::unique_ptr new_pool_for_device(queue_ptr qptr, int device); + static std::unique_ptr new_pool_for_host(queue_ptr qptr, int device); + ggml_sycl_pool & pool(int device) { if (pools[device] == nullptr) { pools[device] = new_pool_for_device(stream(device,0), device); @@ -345,6 +349,15 @@ struct ggml_backend_sycl_context { ggml_sycl_pool & pool() { return pool(device); } + + ggml_sycl_pool & host_pool(int device) { + if (host_pools[device] == nullptr) { + host_pools[device] = new_pool_for_host(stream(device, 0), device); + } + return *host_pools[device]; + } + + ggml_sycl_pool & host_pool() { return host_pool(device); } }; // 
common device functions diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index e167948e7..c96395be6 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -82,6 +82,14 @@ inline std::string get_device_backend_and_type(const sycl::device &device) { return device_type.str(); } +template struct matrix_info_t { + oneapi::mkl::transpose transpose_info[2]; + Ts value_info[2]; + std::int64_t size_info[3]; + std::int64_t ld_info[3]; + std::int64_t groupsize_info; +}; + namespace dpct { typedef sycl::queue *queue_ptr; @@ -1727,26 +1735,13 @@ namespace dpct }; template - inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, - oneapi::mkl::transpose b_trans, int m, int n, int k, - const void *alpha, const void **a, int lda, - const void **b, int ldb, const void *beta, void **c, - int ldc, int batch_size) - { - struct matrix_info_t - { - oneapi::mkl::transpose transpose_info[2]; - Ts value_info[2]; - std::int64_t size_info[3]; - std::int64_t ld_info[3]; - std::int64_t groupsize_info; - }; - + inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, + int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b, + int ldb, const void * beta, void ** c, int ldc, int batch_size, + matrix_info_t * matrix_info) { Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); - matrix_info_t *matrix_info = - (matrix_info_t *)std::malloc(sizeof(matrix_info_t)); matrix_info->transpose_info[0] = a_trans; matrix_info->transpose_info[1] = b_trans; matrix_info->value_info[0] = alpha_value; @@ -1763,23 +1758,18 @@ namespace dpct sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( oneapi::mkl::backend_selector{ q }, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1, - matrix_info->size_info + 2, 
matrix_info->value_info, reinterpret_cast(a), - matrix_info->ld_info, reinterpret_cast(b), matrix_info->ld_info + 1, - matrix_info->value_info + 1, reinterpret_cast(c), matrix_info->ld_info + 2, 1, - &(matrix_info->groupsize_info)); + matrix_info->size_info + 2, reinterpret_cast(matrix_info->value_info), + reinterpret_cast(a), matrix_info->ld_info, reinterpret_cast(b), + matrix_info->ld_info + 1, reinterpret_cast(matrix_info->value_info + 1), + reinterpret_cast(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); #else sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info, - matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info, + matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast(matrix_info->value_info), reinterpret_cast(a), matrix_info->ld_info, reinterpret_cast(b), - matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast(c), - matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); + matrix_info->ld_info + 1, reinterpret_cast(matrix_info->value_info + 1), + reinterpret_cast(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); #endif - - q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(e); - cgh.host_task([=] { std::free(matrix_info); }); }); } template @@ -2422,25 +2412,11 @@ namespace dpct /// \param [in] ldc Leading dimension of C. /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. /// \param [in] scaling_type Data type of the scaling factors. 
- inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans, - oneapi::mkl::transpose b_trans, int m, int n, int k, - const void *alpha, const void *a[], - library_data_t a_type, int lda, const void *b[], - library_data_t b_type, int ldb, const void *beta, - void *c[], library_data_t c_type, int ldc, - int batch_size, library_data_t scaling_type) - { - if (scaling_type == library_data_t::real_float && - c_type == library_data_t::complex_float) - { - scaling_type = library_data_t::complex_float; - } - else if (scaling_type == library_data_t::real_double && - c_type == library_data_t::complex_double) - { - scaling_type = library_data_t::complex_double; - } - + inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, + int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda, + const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[], + library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type, + matrix_info_t * matrix_info) { std::uint64_t key = detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); switch (key) @@ -2449,48 +2425,24 @@ namespace dpct library_data_t::real_float, library_data_t::real_float, library_data_t::real_float, library_data_t::real_float): { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, batch_size, matrix_info); break; } case detail::get_type_combination_id( library_data_t::real_double, library_data_t::real_double, library_data_t::real_double, library_data_t::real_double): { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_float, library_data_t::complex_float, - library_data_t::complex_float, 
library_data_t::complex_float): - { - detail::gemm_batch_impl, std::complex, - std::complex, std::complex>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_double, library_data_t::complex_double, - library_data_t::complex_double, library_data_t::complex_double): - { - detail::gemm_batch_impl, std::complex, - std::complex, std::complex>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, batch_size, matrix_info); break; } case detail::get_type_combination_id( library_data_t::real_half, library_data_t::real_half, library_data_t::real_half, library_data_t::real_half): { - detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, - a, lda, b, ldb, beta, c, ldc, - batch_size); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); break; } #ifdef __INTEL_MKL__ @@ -2498,19 +2450,16 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float): { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); break; } case detail::get_type_combination_id( library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): { - detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, - b, ldb, beta, c, ldc, batch_size); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); break; } #endif @@ -2522,10 +2471,9 @@ namespace dpct dpct::get_value(reinterpret_cast(alpha), q); float beta_float = 
dpct::get_value(reinterpret_cast(beta), q); - detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, &alpha_float, - a, lda, b, ldb, &beta_float, c, ldc, - batch_size); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc, batch_size, + matrix_info); break; } case detail::get_type_combination_id( @@ -2533,8 +2481,7 @@ namespace dpct library_data_t::real_float, library_data_t::real_float): { detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); break; } case detail::get_type_combination_id( @@ -2542,8 +2489,7 @@ namespace dpct library_data_t::real_float, library_data_t::real_float): { detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); break; } case detail::get_type_combination_id( @@ -2557,8 +2503,7 @@ namespace dpct sycl::half alpha_half(alpha_value); sycl::half beta_half(beta_value); detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, - batch_size); + q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, batch_size, matrix_info); break; } default: diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 5272ca454..ed4d8bb8b 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1173,6 +1173,85 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { } }; +struct ggml_sycl_pool_host : public ggml_sycl_pool { + queue_ptr qptr; + int device; + + inline static int counter{ 0 }; + + struct ggml_sycl_buffer { + void * ptr = nullptr; + size_t size = 0; + }; + + // Set arbitrarly to 64 + static constexpr int MAX_POOL_SIZE{ 64 }; + std::vector buffer_pool = 
std::vector(MAX_POOL_SIZE); + size_t pool_size = 0; + + explicit ggml_sycl_pool_host(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {} + + ~ggml_sycl_pool_host() { + for (int i = 0; i < MAX_POOL_SIZE; ++i) { + ggml_sycl_buffer & b = buffer_pool[i]; + if (b.ptr != nullptr) { + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr))); + b.ptr = nullptr; + pool_size -= b.size; + b.size = 0; + } + } + counter = 0; + } + + void * alloc(size_t size, size_t * actual_size) override { + if (counter == MAX_POOL_SIZE) { + ggml_sycl_buffer b = buffer_pool[0]; + void * ptr = b.ptr; + *actual_size = b.size; + counter = 1; + return ptr; + } + ggml_sycl_buffer & b = buffer_pool[counter]; + + if (b.ptr == nullptr) { + void * ptr; + + SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *) sycl::malloc_host(size, *qptr))); + if (!ptr) { + GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size); + return nullptr; + } + pool_size += size; + *actual_size = size; + counter = counter + 1; + return ptr; + } else { + ++counter; + b.size = size; + return b.ptr; + } + } + + void free(void * ptr, size_t size) override { + // if the pool is not completed add the pointer to it in place of the first nullptr found. + // Otherwise do nothing, pointers will be freed once the pool is deallocated. 
+ for (int i = 0; i < MAX_POOL_SIZE; ++i) { + ggml_sycl_buffer & b = buffer_pool[i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + } +}; + +std::unique_ptr ggml_backend_sycl_context::new_pool_for_host(queue_ptr qptr, int device) { + // return pool for the host to speed up memory management + return std::unique_ptr(new ggml_sycl_pool_host(qptr, device)); +} + std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) { // TBD: NO VMM support // if (ggml_sycl_info().devices[device].vmm) { @@ -3363,6 +3442,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, ggml_sycl_pool_alloc ptrs_src(ctx.pool(), 2*ne23); ggml_sycl_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23); + ggml_sycl_pool_alloc> matrix_info(ctx.host_pool(), 1); sycl::range<3> block_dims(1, ne12, ne13); /* @@ -3391,14 +3471,10 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, }); } SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( - *main_stream, oneapi::mkl::transpose::trans, - oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, - (const void **)(ptrs_src.get() + 0 * ne23), - dpct::library_data_t::real_half, nb01 / nb00, - (const void **)(ptrs_src.get() + 1 * ne23), - dpct::library_data_t::real_half, nb11 / nb10, beta, - (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, - cu_compute_type))); + *main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, + (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00, + (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta, + (void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get()))); } } catch (sycl::exception const &exc) { From b9daaffe02d6a77d85f0420bce5dfe0e00daeff6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 19 Jan 2025 18:12:09 +0200 Subject: [PATCH 
27/30] simple-chat : fix BOS being added to each message (#11278) --- examples/simple-chat/simple-chat.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index e8eda9c22..26422601d 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -95,11 +95,11 @@ int main(int argc, char ** argv) { llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // helper function to evaluate a prompt and generate a response - auto generate = [&](const std::string & prompt) { + auto generate = [&](const std::string & prompt, bool is_first) { std::string response; // tokenize the prompt - const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true); + const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); std::vector prompt_tokens(n_prompt_tokens); if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) { GGML_ABORT("failed to tokenize the prompt\n"); @@ -180,7 +180,7 @@ int main(int argc, char ** argv) { // generate a response printf("\033[33m"); - std::string response = generate(prompt); + std::string response = generate(prompt, prev_len == 0); printf("\n\033[0m"); // add the response to the messages From 92bc493917d43b83e592349e138b54c90b1c3ea7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 19 Jan 2025 20:22:30 +0200 Subject: [PATCH 28/30] tests : increase timeout when sanitizers are enabled (#11300) * tests : increase timeout when sanitizers are enabled * tests : add DEFAULT_HTTP_TIMEOUT --- .github/workflows/server.yml | 25 ++++++++++++++++++++++--- examples/server/tests/utils.py | 5 ++++- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 
671fe595c..ed1c357a5 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -112,9 +112,9 @@ jobs: -DGGML_OPENMP=OFF ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} + - name: Build (sanitizers) + id: cmake_build_sanitizers + if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }} run: | cmake -B build \ -DGGML_NATIVE=OFF \ @@ -124,12 +124,31 @@ jobs: -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + - name: Build (sanitizers) + id: cmake_build + if: ${{ matrix.sanitizer == '' }} + run: | + cmake -B build \ + -DGGML_NATIVE=OFF \ + -DLLAMA_BUILD_SERVER=ON \ + -DLLAMA_CURL=ON \ + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + - name: Tests id: server_integration_tests + if: ${{ matrix.sanitizer == '' }} run: | cd examples/server/tests ./tests.sh + - name: Tests (sanitizers) + id: server_integration_tests_sanitizers + if: ${{ matrix.sanitizer != '' }} + run: | + cd examples/server/tests + LLAMA_SANITIZE=1 ./tests.sh + - name: Slow tests id: server_integration_tests_slow if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index 73be4c92f..9d1a7a5b0 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -26,6 +26,9 @@ from re import RegexFlag import wget +DEFAULT_HTTP_TIMEOUT = 10 if "LLAMA_SANITIZE" not in os.environ else 30 + + class ServerResponse: headers: dict status_code: int @@ -88,7 +91,7 @@ class ServerProcess: if "PORT" in os.environ: self.server_port = int(os.environ["PORT"]) - def start(self, timeout_seconds: int = 10) -> None: + def start(self, timeout_seconds: int | 
None = DEFAULT_HTTP_TIMEOUT) -> None: if "LLAMA_SERVER_BIN_PATH" in os.environ: server_path = os.environ["LLAMA_SERVER_BIN_PATH"] elif os.name == "nt": From ae3c1db2f9c4beec0737c6a82d1f3791fd6fcdb2 Mon Sep 17 00:00:00 2001 From: Kyle Bruene Date: Mon, 20 Jan 2025 01:21:01 -0600 Subject: [PATCH 29/30] llama : re-add LLM_ARCH_PHIMOE (#11305) Phi 3.5 MoE was partially removed during a refactor. The code was originally in llama.cpp and should be in llama-model.cpp after the refactor. --- src/llama-model.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c2d23a8d3..6dfcd5f59 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2203,6 +2203,50 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); } } break; + case LLM_ARCH_PHIMOE: + { + const int64_t n_embd_head = n_embd / n_head; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0); + output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0); + + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED); + if (layer.wqkv == nullptr) { + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + 
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0); + + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0); + + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0); + } + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0); + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); + + layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); + } + } break; case LLM_ARCH_PLAMO: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); From ef6dada60ca710f4edfbb6fd9e1258685d8ea49d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 20 Jan 2025 09:29:32 +0200 Subject: [PATCH 30/30] cont : fix whitespaces (#11305) --- src/llama-model.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6dfcd5f59..590386e64 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2206,43 +2206,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_PHIMOE: { const int64_t n_embd_head = n_embd / n_head; - + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); - + // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0); output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0); - + for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; - + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0); - + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED); if (layer.wqkv == nullptr) { layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0); - + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0); - + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); layer.bv = 
create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0); } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0); layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0); - + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0); - + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0); layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); - + layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); }