From 27ab66e437797aedbb23b3599385756b6c26ac39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Podiv=C3=ADn?= <66251151+jpodivin@users.noreply.github.com> Date: Sun, 16 Jul 2023 21:54:47 +0200 Subject: [PATCH 01/16] py : turn verify-checksum-models.py into executable (#2245) README.md was adjusted to reflect the change. Signed-off-by: Jiri Podivin --- README.md | 2 +- scripts/verify-checksum-models.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) mode change 100644 => 100755 scripts/verify-checksum-models.py diff --git a/README.md b/README.md index 476cc438b..f45e4bf08 100644 --- a/README.md +++ b/README.md @@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t ```bash # run the verification script -python3 .\scripts\verify-checksum-models.py +./scripts/verify-checksum-models.py ``` - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory: diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py old mode 100644 new mode 100755 index d12748281..307b7c08d --- a/scripts/verify-checksum-models.py +++ b/scripts/verify-checksum-models.py @@ -1,3 +1,5 @@ +#!/bin/env python3 + import os import hashlib From 672dda10e4d8ac79df5d5970da7fb69d242ca9a7 Mon Sep 17 00:00:00 2001 From: Qingyou Meng Date: Mon, 17 Jul 2023 03:57:28 +0800 Subject: [PATCH 02/16] ggml : fixed runtime bugs and compile errors related to GGML_PERF and GGML_DEBUG (#2219) * fixed runtime bugs and compile errors related to GGML_PERF and GGML_DEBUG * remove ifdef GGML_PERF; update fmt --- ggml.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/ggml.c b/ggml.c index 5ce1da0e9..c56a3d0e0 100644 --- a/ggml.c +++ b/ggml.c @@ -4412,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) { if (&g_state.contexts[i].context == ctx) { g_state.contexts[i].used = false; - GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", - __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); + GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", + __func__, i, ggml_used_mem(ctx)); if (ctx->mem_buffer_owned) { GGML_ALIGNED_FREE(ctx->mem_buffer); @@ -16317,8 +16317,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (GGML_OP_HAS_FINALIZE[node->op]) { params.nth = n_tasks_arr[node_n]; ggml_compute_forward(¶ms, node); - ggml_graph_compute_perf_stats_node(node, state->shared); } + ggml_graph_compute_perf_stats_node(node, state->shared); } // distribute new work or execute it direct if 1T @@ -16348,8 +16348,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (GGML_OP_HAS_FINALIZE[node->op]) { params.type = GGML_TASK_FINALIZE; ggml_compute_forward(¶ms, node); - ggml_graph_compute_perf_stats_node(node, state->shared); } + + ggml_graph_compute_perf_stats_node(node, state->shared); } else { break; } @@ -16891,9 +16892,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { - //assert(cgraph->work == NULL); - //assert(cgraph->work_size == 0); - uint64_t size_eval = 0; // compute size of intermediate results @@ -17332,9 +17330,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT("=== GRAPH ===\n"); - GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); - GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); - GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; From b7647436ccc80970b44a270f70f4f2ea139054d1 Mon Sep 17 00:00:00 2001 From: Alex Klinkhamer Date: Sun, 16 Jul 2023 14:01:45 -0700 Subject: [PATCH 03/16] llama : fix t_start_sample_us initialization warning (#2238) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 27e1ee964..0f9d5346d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2205,7 +2205,7 @@ void llama_sample_classifier_free_guidance( struct llama_context * guidance_ctx, float scale, float smooth_factor) { - int64_t t_start_sample_us = t_start_sample_us = ggml_time_us(); + int64_t t_start_sample_us = ggml_time_us(); assert(ctx); auto n_vocab = llama_n_vocab(ctx); From 7568d1a2b206331412106ea66da3f871025e0c3c Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 18 Jul 2023 01:39:29 +0800 Subject: [PATCH 04/16] Support dup & cont ops on CUDA (#2242) --- ggml-cuda.cu | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0646fa7b2..d3054a7fa 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3537,6 +3537,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens (void) dst; } +void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_cpy(src0, dst, nullptr); + (void) src1; +} + void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true); @@ -3670,7 +3675,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo // recursively assign CUDA buffers until a compute tensor is found if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { const ggml_op src0_op = tensor->src[0]->op; - if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) { + if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace); } } @@ -3776,6 +3781,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); switch (tensor->op) { + case GGML_OP_DUP: + if (!any_on_device) { + return false; + } + func = ggml_cuda_dup; + break; case GGML_OP_ADD: if (!any_on_device) { return false; @@ -3830,6 +3841,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func = ggml_cuda_cpy; break; + case GGML_OP_CONT: + if (!any_on_device) { + return false; + } + func = ggml_cuda_dup; + break; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: From 6cbf9dfb32f0e23ed3afd02d30ab066ed53e2c4d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Jul 2023 11:50:49 +0300 Subject: [PATCH 05/16] llama : shorten quantization descriptions --- examples/quantize/quantize.cpp | 114 ++++++--------------------------- 1 file changed, 19 insertions(+), 95 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 797d2f0c5..744f549c5 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -14,103 +14,27 @@ struct quant_option { }; static const std::vector QUANT_OPTIONS = { - { - "Q4_0", - LLAMA_FTYPE_MOSTLY_Q4_0, - " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M", - }, - { - "Q4_1", - LLAMA_FTYPE_MOSTLY_Q4_1, - " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L", - }, - { - "Q5_0", - LLAMA_FTYPE_MOSTLY_Q5_0, - " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M", - }, - { - "Q5_1", - LLAMA_FTYPE_MOSTLY_Q5_1, - " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M", - }, + { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", }, + { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", }, + { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", }, + { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", }, #ifdef GGML_USE_K_QUANTS - { - "Q2_K", - LLAMA_FTYPE_MOSTLY_Q2_K, - " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended", - }, - { - "Q3_K", - LLAMA_FTYPE_MOSTLY_Q3_K_M, - "alias for Q3_K_M" - }, - { - "Q3_K_S", - LLAMA_FTYPE_MOSTLY_Q3_K_S, - " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss", - }, - { - "Q3_K_M", - LLAMA_FTYPE_MOSTLY_Q3_K_M, - " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss", - }, - { - "Q3_K_L", - LLAMA_FTYPE_MOSTLY_Q3_K_L, - " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss", - }, - { - "Q4_K", - LLAMA_FTYPE_MOSTLY_Q4_K_M, - "alias for Q4_K_M", - }, - { - "Q4_K_S", - LLAMA_FTYPE_MOSTLY_Q4_K_S, - " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss", - }, - { - "Q4_K_M", - LLAMA_FTYPE_MOSTLY_Q4_K_M, - " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*", - }, - { - "Q5_K", - LLAMA_FTYPE_MOSTLY_Q5_K_M, - "alias for Q5_K_M", - }, - { - "Q5_K_S", - LLAMA_FTYPE_MOSTLY_Q5_K_S, - " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*", - }, - { - "Q5_K_M", - LLAMA_FTYPE_MOSTLY_Q5_K_M, - " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*", - }, - { - "Q6_K", - LLAMA_FTYPE_MOSTLY_Q6_K, - " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss", - }, + { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", }, + { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, + { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", }, + { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", }, + { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", }, + { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, + { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", }, + { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", }, + { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, + { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", }, + { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", }, + { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", }, #endif - { - "Q8_0", - LLAMA_FTYPE_MOSTLY_Q8_0, - " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended", - }, - { - "F16", - LLAMA_FTYPE_MOSTLY_F16, - "13.00G @ 7B - extremely large, virtually no quality loss - not recommended", - }, - { - "F32", - LLAMA_FTYPE_ALL_F32, - "26.00G @ 7B - absolutely huge, lossless - not recommended", - }, + { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", }, + { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, + { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, }; From d01bccde9f759b24449fdaa16306b406a50eb367 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Jul 2023 14:24:43 +0300 Subject: [PATCH 06/16] ci : integrate with ggml-org/ci (#2250) * ci : run ctest ggml-ci * ci : add open llama 3B-v2 tests ggml-ci * ci : disable wget progress output ggml-ci * ci : add open llama 3B-v2 tg tests for q4 and q5 quantizations ggml-ci * tests : try to fix tail free sampling test ggml-ci * ci : add K-quants ggml-ci * ci : add short perplexity tests ggml-ci * ci : add README.md * ppl : add --chunks argument to limit max number of chunks ggml-ci * ci : update README --- .gitignore | 5 +- ci/README.md | 20 +++ ci/run.sh | 262 +++++++++++++++++++++++++++++ examples/common.cpp | 7 + examples/common.h | 1 + examples/perplexity/perplexity.cpp | 6 +- llama.cpp | 15 +- tests/test-sampling.cpp | 2 + 8 files changed, 312 insertions(+), 6 deletions(-) create mode 100644 ci/README.md create mode 100644 ci/run.sh diff --git a/.gitignore b/.gitignore index faec869e0..a23ac5928 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ build/ build-em/ build-debug/ build-release/ +build-ci-debug/ +build-ci-release/ build-static/ build-cublas/ build-opencl/ @@ -25,9 +27,10 @@ build-no-accel/ build-sanitize-addr/ build-sanitize-thread/ out/ +tmp/ models/* -*.bin +models-mnt /main /quantize diff --git a/ci/README.md b/ci/README.md new file mode 100644 index 000000000..6c74c8138 --- /dev/null +++ b/ci/README.md @@ -0,0 +1,20 @@ +# CI + +In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework: + +https://github.com/ggml-org/ci + +It monitors the `master` branch for new commits and runs the +[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us +to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled +to cover various hardware architectures, including GPU and Apple Silicon instances. + +Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message. +Only the branches of this repo are monitored for this keyword. + +It is a good practice, before publishing changes to execute the full CI locally on your machine: + +```bash +mkdir tmp +bash ./ci/run.sh ./tmp/results ./tmp/mnt +``` diff --git a/ci/run.sh b/ci/run.sh new file mode 100644 index 000000000..c823bc467 --- /dev/null +++ b/ci/run.sh @@ -0,0 +1,262 @@ +#/bin/bash + +if [ -z "$2" ]; then + echo "usage: $0 " + exit 1 +fi + +mkdir -p "$1" +mkdir -p "$2" + +OUT=$(realpath "$1") +MNT=$(realpath "$2") + +rm -v $OUT/*.log +rm -v $OUT/*.exit +rm -v $OUT/*.md + +sd=`dirname $0` +cd $sd/../ +SRC=`pwd` + +## helpers + +# download a file if it does not exist or if it is outdated +function gg_wget { + local out=$1 + local url=$2 + + local cwd=`pwd` + + mkdir -p $out + cd $out + + # should not re-download if file is the same + wget -nv -N $url + + cd $cwd +} + +function gg_printf { + printf -- "$@" >> $OUT/README.md +} + +function gg_run { + ci=$1 + + set -o pipefail + set -x + + gg_run_$ci | tee $OUT/$ci.log + cur=$? + echo "$cur" > $OUT/$ci.exit + + set +x + set +o pipefail + + gg_sum_$ci + + ret=$((ret | cur)) +} + +## ci + +# ctest_debug + +function gg_run_ctest_debug { + cd ${SRC} + + rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + + set +e +} + +function gg_sum_ctest_debug { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest in debug mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' + gg_printf '\n' +} + +# ctest_release + +function gg_run_ctest_release { + cd ${SRC} + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + if [ -z $GG_BUILD_LOW_PERF ]; then + (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log + else + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + fi + + set +e +} + +function gg_sum_ctest_release { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest in release mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' +} + +# open_llama_3b_v2 + +function gg_run_open_llama_3b_v2 { + cd ${SRC} + + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json + + gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ + head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw + + path_models="../models-mnt/open-llama/3B-v2" + path_wiki="../models-mnt/wikitext/wikitext-2-raw" + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + python3 ../convert.py ${path_models} + + model_f16="${path_models}/ggml-model-f16.bin" + model_q8_0="${path_models}/ggml-model-q8_0.bin" + model_q4_0="${path_models}/ggml-model-q4_0.bin" + model_q4_1="${path_models}/ggml-model-q4_1.bin" + model_q5_0="${path_models}/ggml-model-q5_0.bin" + model_q5_1="${path_models}/ggml-model-q5_1.bin" + model_q3_k="${path_models}/ggml-model-q3_k.bin" + model_q4_k="${path_models}/ggml-model-q4_k.bin" + model_q5_k="${path_models}/ggml-model-q5_k.bin" + model_q6_k="${path_models}/ggml-model-q6_k.bin" + + wiki_test_60="${path_wiki}/wiki.test-60.raw" + + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/quantize ${model_f16} ${model_q3_k} q3_k + ./bin/quantize ${model_f16} ${model_q4_k} q4_k + ./bin/quantize ${model_f16} ${model_q5_k} q5_k + ./bin/quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/main --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + function check_ppl { + qnt="$1" + ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + + if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then + printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl" + return 20 + fi + + printf ' - %s @ %s OK\n' "$qnt" "$ppl" + return 0 + } + + check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + + set +e +} + +function gg_sum_open_llama_3b_v2 { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'OpenLLaMA 3B-v2:\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" + gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" + gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" + gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)" + gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)" + gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)" + gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)" + gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" + gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" + gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" +} + +## main + +if [ -z $GG_BUILD_LOW_PERF ]; then + rm -rf ${SRC}/models-mnt + + mnt_models=$(realpath ${MNT}/models) + mkdir -p ${mnt_models} + ln -sfn ${mnt_models} ${SRC}/models-mnt + + python3 -m pip install -r ${SRC}/requirements.txt +fi + +ret=0 + +#test $ret -eq 0 && gg_run ctest_debug +#test $ret -eq 0 && gg_run ctest_release + +if [ -z $GG_BUILD_LOW_PERF ]; then + test $ret -eq 0 && gg_run open_llama_3b_v2 +fi + +exit $ret diff --git a/examples/common.cpp b/examples/common.cpp index 8705127cb..fd6dbc0e3 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -279,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_keep = std::stoi(argv[i]); + } else if (arg == "--chunks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_chunks = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -515,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); + fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); if (llama_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } diff --git a/examples/common.h b/examples/common.h index f52fef629..037a4eecb 100644 --- a/examples/common.h +++ b/examples/common.h @@ -28,6 +28,7 @@ struct gpt_params { int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_gpu_layers = 0; // number of layers to store in VRAM int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7e120ff12..bfad99939 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) { // BOS tokens will be added for each chunk before eval auto tokens = ::llama_tokenize(ctx, params.prompt, true); - int count = 0; + const int n_chunk_max = tokens.size() / params.n_ctx; - const int n_chunk = tokens.size() / params.n_ctx; + const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_vocab = llama_n_vocab(ctx); const int n_batch = params.n_batch; + int count = 0; double nll = 0.0; + fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); for (int i = 0; i < n_chunk; ++i) { diff --git a/llama.cpp b/llama.cpp index 0f9d5346d..fa3b7c03c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2024,9 +2024,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * } // Normalize the second derivatives - float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); - for (float & value : second_derivatives) { - value /= second_derivatives_sum; + { + const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); + + if (second_derivatives_sum > 1e-6f) { + for (float & value : second_derivatives) { + value /= second_derivatives_sum; + } + } else { + for (float & value : second_derivatives) { + value = 1.0f / second_derivatives.size(); + } + } } float cum_sum = 0.0f; diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 64f9455d7..4437c3948 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -200,4 +200,6 @@ int main(void) { test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f); printf("OK\n"); + + return 0; } From b1f429095328a34556c0e9a7a2fefced3db3368c Mon Sep 17 00:00:00 2001 From: wzy <32936898+Freed-Wu@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:01:11 +0800 Subject: [PATCH 07/16] cmake : install targets (#2256) fix #2252 --- CMakeLists.txt | 25 +++++++++++++++++++ convert-lora-to-ggml.py | 1 + convert.py | 1 + examples/baby-llama/CMakeLists.txt | 1 + examples/benchmark/CMakeLists.txt | 1 + examples/embd-input/CMakeLists.txt | 2 ++ examples/embedding/CMakeLists.txt | 1 + examples/main/CMakeLists.txt | 1 + examples/metal/CMakeLists.txt | 1 + examples/perplexity/CMakeLists.txt | 1 + examples/quantize-stats/CMakeLists.txt | 1 + examples/quantize/CMakeLists.txt | 1 + examples/save-load-state/CMakeLists.txt | 1 + examples/server/CMakeLists.txt | 1 + examples/simple/CMakeLists.txt | 1 + .../train-text-from-scratch/CMakeLists.txt | 1 + tests/CMakeLists.txt | 1 + 17 files changed, 42 insertions(+) mode change 100644 => 100755 convert-lora-to-ggml.py mode change 100644 => 100755 convert.py diff --git a/CMakeLists.txt b/CMakeLists.txt index d9381dae1..abc96814d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -512,6 +512,7 @@ if (BUILD_SHARED_LIBS) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(ggml_shared SHARED $) target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) + install(TARGETS ggml_shared LIBRARY) endif() add_library(llama @@ -533,8 +534,32 @@ if (BUILD_SHARED_LIBS) if (LLAMA_METAL) set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") endif() + install(TARGETS llama LIBRARY) endif() +include(GNUInstallDirs) +install( + FILES convert.py + PERMISSIONS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE + DESTINATION ${CMAKE_INSTALL_BINDIR}) +install( + FILES convert-lora-to-ggml.py + PERMISSIONS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE + DESTINATION ${CMAKE_INSTALL_BINDIR}) # # programs, examples and tests diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py old mode 100644 new mode 100755 index f43c836f5..b4999ff5a --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import json import os import re diff --git a/convert.py b/convert.py old mode 100644 new mode 100755 index 7a2705e5c..e3f1096e1 --- a/convert.py +++ b/convert.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import argparse import concurrent.futures import copy diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt index d2ce36367..7b70227a5 100644 --- a/examples/baby-llama/CMakeLists.txt +++ b/examples/baby-llama/CMakeLists.txt @@ -1,4 +1,5 @@ set(TARGET baby-llama) add_executable(${TARGET} baby-llama.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 037696194..3f3415350 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET benchmark) add_executable(${TARGET} benchmark-matmult.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt index 2b623953e..5bbb1ea02 100644 --- a/examples/embd-input/CMakeLists.txt +++ b/examples/embd-input/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET embdinput) add_library(${TARGET} embd-input-lib.cpp embd-input.h) +install(TARGETS ${TARGET} LIBRARY) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) @@ -8,6 +9,7 @@ endif() set(TARGET embd-input-test) add_executable(${TARGET} embd-input-test.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index db73b6b44..0c752c7bb 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET embedding) add_executable(${TARGET} embedding.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index c364242fb..cc1888948 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET main) add_executable(${TARGET} main.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/metal/CMakeLists.txt b/examples/metal/CMakeLists.txt index a8c4284a5..f16d49165 100644 --- a/examples/metal/CMakeLists.txt +++ b/examples/metal/CMakeLists.txt @@ -1,3 +1,4 @@ set(TEST_TARGET metal) add_executable(${TEST_TARGET} metal.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE ggml) diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index 61b17b828..af00b4e16 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET perplexity) add_executable(${TARGET} perplexity.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index 7bebc11a1..c5c394058 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -1,4 +1,5 @@ set(TARGET quantize-stats) add_executable(${TARGET} quantize-stats.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 475fc8be8..47d0be72e 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index 08dbe5c2b..eadd13cdf 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET save-load-state) add_executable(${TARGET} save-load-state.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 07ba76ad3..812a24b09 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -2,6 +2,7 @@ set(TARGET server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp json.hpp httplib.h) +install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 1568f7364..0ac9cb03a 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET simple) add_executable(${TARGET} simple.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt index 1a44c4961..4459516d0 100644 --- a/examples/train-text-from-scratch/CMakeLists.txt +++ b/examples/train-text-from-scratch/CMakeLists.txt @@ -1,4 +1,5 @@ set(TARGET train-text-from-scratch) add_executable(${TARGET} train-text-from-scratch.cpp) +install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1acf050a7..11ec6c725 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,6 +1,7 @@ function(llama_add_test source) get_filename_component(TEST_TARGET ${source} NAME_WE) add_executable(${TEST_TARGET} ${source}) + install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE llama) add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) endfunction() From 45a1b07e9b20c33d71d8c849ff27d693a75a0269 Mon Sep 17 00:00:00 2001 From: wzy <32936898+Freed-Wu@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:01:55 +0800 Subject: [PATCH 08/16] flake : update flake.nix (#2270) When `isx86_32 || isx86_64`, it will use mkl, else openblas According to https://discourse.nixos.org/t/rpath-of-binary-contains-a-forbidden-reference-to-build/12200/3, add -DCMAKE_SKIP_BUILD_RPATH=ON Fix #2261, Nix doesn't provide mkl-sdl.pc. When we build with -DBUILD_SHARED_LIBS=ON, -DLLAMA_BLAS_VENDOR=Intel10_lp64 replace mkl-sdl.pc by mkl-dynamic-lp64-iomp.pc --- CMakeLists.txt | 11 ++++++++- README.md | 2 +- flake.nix | 63 ++++++++++++++++++++++++++++---------------------- 3 files changed, 47 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index abc96814d..169332767 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,7 +186,16 @@ if (LLAMA_BLAS) pkg_check_modules(DepBLAS REQUIRED flexiblas_api) elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel") # all Intel* libraries share the same include path - pkg_check_modules(DepBLAS REQUIRED mkl-sdl) + pkg_check_modules(DepBLAS mkl-sdl) + if (NOT DepBLAS) + if (BUILD_SHARED_LIBS) + set(LINK_METHOD dynamic) + else() + set(LINK_METHOD static) + endif() + string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR}) + pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp) + endif() elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC") # this doesn't provide pkg-config # suggest to assign BLAS_INCLUDE_DIRS on your own diff --git a/README.md b/README.md index f45e4bf08..073b621e9 100644 --- a/README.md +++ b/README.md @@ -360,7 +360,7 @@ Building the program with BLAS support may lead to some performance improvements ```bash mkdir build cd build - cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake --build . --config Release ``` diff --git a/flake.nix b/flake.nix index dda7ee7b7..5657e8258 100644 --- a/flake.nix +++ b/flake.nix @@ -6,24 +6,27 @@ outputs = { self, nixpkgs, flake-utils }: flake-utils.lib.eachDefaultSystem (system: let - inherit (pkgs.stdenv) isAarch64 isDarwin; - inherit (pkgs.lib) optionals; - isM1 = isAarch64 && isDarwin; - osSpecific = if isM1 then - with pkgs.darwin.apple_sdk_11_0.frameworks; [ - Accelerate - MetalKit - MetalPerformanceShaders - MetalPerformanceShadersGraph - ] - else if isDarwin then - with pkgs.darwin.apple_sdk.frameworks; [ - Accelerate - CoreGraphics - CoreVideo - ] - else - [ ]; + inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin; + osSpecific = with pkgs; [ openmpi ] ++ + ( + if isAarch64 && isDarwin then + with pkgs.darwin.apple_sdk_11_0.frameworks; [ + Accelerate + MetalKit + MetalPerformanceShaders + MetalPerformanceShadersGraph + ] + else if isAarch32 && isDarwin then + with pkgs.darwin.apple_sdk.frameworks; [ + Accelerate + CoreGraphics + CoreVideo + ] + else if isx86_32 || isx86_64 then + with pkgs; [ mkl ] + else + with pkgs; [ openblas ] + ); pkgs = import nixpkgs { inherit system; }; llama-python = pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]); @@ -31,22 +34,28 @@ packages.default = pkgs.stdenv.mkDerivation { name = "llama.cpp"; src = ./.; - postPatch = if isM1 then '' + postPatch = '' substituteInPlace ./ggml-metal.m \ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - '' else - ""; - nativeBuildInputs = with pkgs; [ cmake ]; + ''; + nativeBuildInputs = with pkgs; [ cmake pkgconfig ]; buildInputs = osSpecific; - cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [ - "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" - "-DLLAMA_METAL=ON" + cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ] + ++ (if isAarch64 && isDarwin then [ + "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" + "-DLLAMA_METAL=ON" + ] else if isx86_32 || isx86_64 then [ + "-DLLAMA_BLAS=ON" + "-DLLAMA_BLAS_VENDOR=Intel10_lp64" + ] else [ + "-DLLAMA_BLAS=ON" + "-DLLAMA_BLAS_VENDOR=OpenBLAS" ]); installPhase = '' runHook preInstall - mkdir -p $out/bin - mv bin/* $out/bin/ + install -D bin/* -t $out/bin + install -Dm644 lib*.so -t $out/lib mv $out/bin/main $out/bin/llama mv $out/bin/server $out/bin/llama-server From 294f424554c1599784ac9962462fc39ace92d8a5 Mon Sep 17 00:00:00 2001 From: Rinne Date: Wed, 19 Jul 2023 15:06:40 +0800 Subject: [PATCH 09/16] llama : extend API to get max devices at runtime (#2253) --- llama.cpp | 4 ++++ llama.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/llama.cpp b/llama.cpp index fa3b7c03c..3319b7023 100644 --- a/llama.cpp +++ b/llama.cpp @@ -875,6 +875,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { return result; } +int llama_max_devices() { + return LLAMA_MAX_DEVICES; +} + bool llama_mmap_supported() { return llama_mmap::SUPPORTED; } diff --git a/llama.h b/llama.h index e744584f2..b676a383b 100644 --- a/llama.h +++ b/llama.h @@ -153,6 +153,8 @@ extern "C" { int32_t n_eval; }; + LLAMA_API int llama_max_devices(); + LLAMA_API struct llama_context_params llama_context_default_params(); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); From 417a85a0010519224cf154eb85d383ffeafeeead Mon Sep 17 00:00:00 2001 From: Shouzheng Liu Date: Thu, 20 Jul 2023 06:32:22 -0400 Subject: [PATCH 10/16] metal: minor q4 optimization and reduce code size (#2248) * metal: use uint16_t instead of uint8_t. Apple GPU doesn't like uint8_t. For every operation on uint8_t the gpu need to copy the uint8_t to an empty 16 bit register, then it can issue other instructions. For the matrix-vector multiplication kernel only, we observed a 340~350 GB/s memory read speed on M1 Max after this commit, which is very close to the reported hardware limit. * metal: update rms_norm kernel This commit double the speed of rms_norm operations by using 512 threads per threadgroup, combining with SIMD primitives to minimize the need for thread group barriers. * metal: use template to reduce size Revert modifications on block_q4_0 and block_q4_1. --- ggml-metal.m | 4 +- ggml-metal.metal | 270 +++++++++++++++++++---------------------------- 2 files changed, 110 insertions(+), 164 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index ee205bcdf..d80a380d7 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -792,7 +792,7 @@ void ggml_metal_graph_compute( const float eps = 1e-6f; - const int nth = 256; + const int nth = 512; [encoder setComputePipelineState:ctx->pipeline_rms_norm]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -800,7 +800,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0]; const int64_t nrows = ggml_nrows(src0); diff --git a/ggml-metal.metal b/ggml-metal.metal index 9f9a4fbd7..ee56336ac 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -331,26 +331,33 @@ kernel void kernel_rms_norm( threadgroup float * sum [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], uint ntg[[threads_per_threadgroup]]) { - device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01); + device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01); + device const float * x_scalar = (device const float *) x; + float4 sumf=0; + float all_sum=0; // parallel sum - sum[tpitg] = 0.0f; - for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - sum[tpitg] += x[i00] * x[i00]; + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { + sumf += x[i00] * x[i00]; + } + all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3]; + all_sum = simd_sum(all_sum); + if (tiisg == 0) { + sum[sgitg] = all_sum; } - // reduce threadgroup_barrier(mem_flags::mem_threadgroup); - for (uint i = ntg/2; i > 0; i /= 2) { - if (tpitg < i) { - sum[tpitg] += sum[tpitg + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); + // broadcast, simd group number is ntg / 32 + for (int i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } } - - // broadcast if (tpitg == 0) { + for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];} sum[0] /= ne00; } @@ -359,16 +366,101 @@ kernel void kernel_rms_norm( const float mean = sum[0]; const float scale = 1.0f/sqrt(mean + eps); - device float * y = dst + tgpig*ne00; - for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + device float4 * y = (device float4 *) (dst + tgpig*ne00); + device float * y_scalar = (device float *) y; + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { y[i00] = x[i00] * scale; } + if (tpitg == 0) { + for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;} + } +} + +// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i]) +float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) { + float d = qb_curr->d; + float4 acc = 0.f; + device uint16_t * qs = ((device uint16_t *)qb_curr + 1); + for (int i = 0; i < 16; i+=2) { + acc[0] += yl[i] * (qs[i / 2] & 0x000F); + acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0); + acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00); + acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000); + } + return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f); +} + +// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i]) +float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) { + float d = qb_curr->d; + float m = qb_curr->m; + float4 acc = 0.f; + device uint16_t * qs = ((device uint16_t *)qb_curr + 2); + for (int i = 0; i < 16; i+=2) { + acc[0] += yl[i] * (qs[i / 2] & 0x000F); + acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0); + acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00); + acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000); + } + return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m; } // putting them in the kernel cause a significant performance penalty #define N_DST 4 // each SIMD group works on 4 rows #define N_SIMDGROUP 2 // number of SIMD groups in a thread group #define N_SIMDWIDTH 32 // assuming SIMD group size is 32 +template +void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst, + int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01, + uint2 tgpig, uint tiisg, uint sgitg) { + const int nb = ne00/QK4_0; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb; + device const float * y = (device const float *) src1 + r1*ne10; + float4 y_curr[8]; // src1 vector cache + float sumf[N_DST]={0.f}, all_sum; + thread float * yl=(thread float *)y_curr; + + // each thread in a SIMD group deals with 1 block. + for (int column = 0; column < nb / N_SIMDWIDTH; column++) { + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; + } + + for (int row = 0; row < N_DST; row++) { + sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl); + } + } + + // from now loads two rows every time and 16 blocks per row + int ir = tiisg / (N_SIMDWIDTH / 2); + int ib = tiisg % (N_SIMDWIDTH / 2); + for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) { + int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; + } + + for (int row = 0; row < N_DST; row+=2) { + if (nb_start + ib < nb) { + sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl); + } + } + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } +} + kernel void kernel_mul_mat_q4_0_f32( device const void * src0, device const float * src1, @@ -380,80 +472,7 @@ kernel void kernel_mul_mat_q4_0_f32( uint2 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - const int nb = ne00/QK4_0; - const int r0 = tgpig.x; - const int r1 = tgpig.y; - device const block_q4_0 * x = (device const block_q4_0 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb; - device const float * y = (device const float *) src1 + r1*ne10; - block_q4_0 qb_curr, qb_next; - float4 y_curr[8]; // src1 vector cache - float sumf[N_DST]={0.f}, all_sum; - thread float * yl=(thread float *)y_curr; - - // bootstrap - qb_curr = x[tiisg]; - // each thread in a SIMD group deals with 1 block. - for (int column = 0; column < nb / N_SIMDWIDTH; column++) { - - float sumy = 0; - for (int i = 0; i < QK4_0 / 4; i++) { - y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); - sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; - } - sumy *= (-8.f); - - for (int row = 0; row < N_DST; row++) { - // prefetch next x block - qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - - // calculate - float d = qb_curr.d; - float acc = sumy; - for (int i = 0; i < 16; i++) { - acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); - } - sumf[row] += d * acc; - qb_curr = qb_next; - } - } - - if (nb % N_SIMDWIDTH == 0) { - for (int row = 0; row < N_DST; ++row) { - all_sum = simd_sum(sumf[row]); - if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { - dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; - } - } - } else { - - float sumy = 0; - for (int i = 0; i < QK4_0 / 4; i++) { - y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); - sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; - } - sumy *= (-8.f); - - for (int row = 0; row < N_DST; row++) { - // prefetch next x block - qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - - // calculate - float d = qb_curr.d; - float acc = sumy; - for (int i = 0; i < 16; i++) { - acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); - } - if (tiisg < nb % N_SIMDWIDTH) { - sumf[row] += d * acc; - } - qb_curr = qb_next; - - all_sum = simd_sum(sumf[row]); - if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { - dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; - } - } - } + mul_vec_q_n_f32(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg); } kernel void kernel_mul_mat_q4_1_f32( @@ -467,80 +486,7 @@ kernel void kernel_mul_mat_q4_1_f32( uint2 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - const int nb = ne00/QK4_0; - const int r0 = tgpig.x; - const int r1 = tgpig.y; - device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb; - device const float * y = (device const float *) src1 + r1*ne10; - block_q4_1 qb_curr, qb_next; - float4 y_curr[8]; // src1 vector cache - float sumf[N_DST]={0.f}, all_sum; - thread float * yl=(thread float *)y_curr; - - // bootstrap - qb_curr = x[tiisg]; - // each thread in a SIMD group deals with 1 block. - for (int column = 0; column < nb / N_SIMDWIDTH; column++) { - - float sumy = 0; - for (int i = 0; i < QK4_0 / 4; i++) { - y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); - sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; - } - - for (int row = 0; row < N_DST; row++) { - // prefetch next x block - qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - - // calculate - const float d = qb_curr.d; - const float m = qb_curr.m; - float acc = 0.f; - for (int i = 0; i < 16; i++) { - acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); - } - sumf[row] += d * acc + m * sumy; - qb_curr = qb_next; - } - } - - if (nb % N_SIMDWIDTH == 0) { - for (int row = 0; row < N_DST; ++row) { - all_sum = simd_sum(sumf[row]); - if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { - dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; - } - } - } else { - - float sumy = 0; - for (int i = 0; i < QK4_0 / 4; i++) { - y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); - sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; - } - - for (int row = 0; row < N_DST; row++) { - // prefetch next x block - qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - - // calculate - const float d = qb_curr.d; - const float m = qb_curr.m; - float acc = 0.f; - for (int i = 0; i < 16; i++) { - acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); - } - if (tiisg < nb % N_SIMDWIDTH) { - sumf[row] += d * acc + m * sumy; - } - qb_curr = qb_next; - - all_sum = simd_sum(sumf[row]); - if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { - dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; - } - } - } + mul_vec_q_n_f32(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg); } kernel void kernel_mul_mat_f16_f32( From fff0e0eafe817eef429ecb64f892ab7bdae31846 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Jul 2023 13:47:26 +0300 Subject: [PATCH 11/16] llama : fix regression from #2000 - could not load no-mmap models --- llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 3319b7023..796dfdacb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -555,7 +555,9 @@ struct llama_file_loader { } // skip to the next multiple of 32 bytes - file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); + if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { + file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); + } tensor.file_off = file.tell(); tensor.name = name; From 785829dfe8baf0213f2ff66963d28c62f92d7930 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 20 Jul 2023 15:18:43 +0300 Subject: [PATCH 12/16] Faster Q4_K on Metal (#2290) Co-authored-by: Iwan Kawrakow --- ggml-metal.m | 7 +- ggml-metal.metal | 262 ++++++++++++++++++++++++++++------------------- 2 files changed, 160 insertions(+), 109 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index d80a380d7..5e2a21100 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -694,8 +694,8 @@ void ggml_metal_graph_compute( GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne12 == 1); - nth0 = 4; - nth1 = 16; + nth0 = 2; + nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; } break; case GGML_TYPE_Q5_K: @@ -739,7 +739,8 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || + src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q2_K || diff --git a/ggml-metal.metal b/ggml-metal.metal index ee56336ac..a9d134d6e 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1452,6 +1452,7 @@ kernel void kernel_mul_mat_q3_K_f32( } +#if QK_K == 256 kernel void kernel_mul_mat_q4_K_f32( device const void * src0, device const float * src1, @@ -1459,131 +1460,180 @@ kernel void kernel_mul_mat_q4_K_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], + constant int64_t & ne01[[buffer(4)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { - - const int nb = ne00/QK_K; - - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; - - const int nth = tptg.x*tptg.y; - const int ith = tptg.y*tpitg.x + tpitg.y; - - device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb; - device const float * yy = (device const float *) src1 + r1*ne10; - - float sumf = 0; - -#if QK_K == 256 + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; - const int tid = tpitg.y; // 0...16 - const int il = tid/4; // 0...3 - const int ir = tid - 4*il;// 0...3 - const int n = 4; + const int ix = tiisg/8; // 0...3 + const int it = tiisg%8; // 0...7 + const int im = it/4; // 0 or 1 + const int ir = it%4; // 0...3 - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row; + device const float * y = (device const float *) src1 + r1*ne10; + float yl[16]; + float yh[16]; + float sumf[N_DST]={0.f}, all_sum; - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; + const int step = sizeof(block_q4_K) * nb / 2; - uchar2 sc1, sc2, sc3, sc4; + device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir; - for (int i = tpitg.x; i < nb; i += tptg.x) { + uint16_t sc16[4]; + thread const uint8_t * sc8 = (thread const uint8_t *)sc16; - device const uint8_t * q1 = (x + i)->qs + q_offset; - device const uint8_t * q2 = q1 + 64; - device const float * y1 = yy + i*QK_K + y_offset; - device const float * y2 = y1 + 128; - - const float dall = (float)((x + i)->d); - const float dmin = (float)((x + i)->dmin); - - device const uint16_t * a = (device const uint16_t *)(x + i)->scales; - sc1 = as_type((uint16_t)(a[im+0] & kmask1)); - sc2 = as_type((uint16_t)(a[im+2] & kmask1)); - sc3 = as_type((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2))); - sc4 = as_type((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2))); - - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < n; ++l) { - - s[0] += y1[l] * (q1[l] & 0xF); s[1] += y1[l+32] * (q1[l] >> 4); - s[2] += y2[l] * (q2[l] & 0xF); s[3] += y2[l+32] * (q2[l] >> 4); - smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1]; + for (int ib = ix; ib < nb; ib += 4) { + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+0] = y4[i+ 0]; sumy[0] += yl[i+0]; + yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8]; + yh[i+0] = y4[i+128]; sumy[2] += yh[i+0]; + yh[i+8] = y4[i+160]; sumy[3] += yh[i+8]; } - sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin; + device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im; + device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const half * dh = &x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + sc16[0] = sc[0] & kmask1; + sc16[1] = sc[2] & kmask1; + sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2); + sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2); + + device const uint16_t * q2 = q1 + 32; + + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+0] * (q1[i/2] & 0x000F); + acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00); + acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0); + acc1[3] += yl[i+9] * (q1[i/2] & 0xF000); + acc2[0] += yh[i+0] * (q2[i/2] & 0x000F); + acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00); + acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0); + acc2[3] += yh[i+9] * (q2[i/2] & 0xF000); + } + + float dall = dh[0]; + float dmin = dh[1]; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] + + (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f + + (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] + + (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) - + dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); + + q1 += step; + sc += step; + dh += step; + } + + y4 += 4 * QK_K; } -#else - uint16_t aux16[2]; - thread const uint8_t * scales = (thread const uint8_t *)aux16; - const int il = 4*tpitg.x; - - for (int i = tpitg.y; i < nb; i += tptg.y) { - - device const uint8_t * q = x[i].qs + il; - device const float * y = yy + i * QK_K + il; - - const float d = (float)x[i].d[0]; - const float m = (float)x[i].d[1]; - - device const uint16_t * a = (device const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - - for (int l = 0; l < 4; ++l) { - sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16]) - + d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]); + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + first_row + row] = all_sum; } } -#endif - - sum[ith] = sumf; - - // - // Accumulate the sum from all threads in the threadgroup - // This version is slightly faster than the commented out one below, - // which I copy-pasted from ggerganov's q4_0 dot product for metal. - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; - } - - //// accumulate the sum from all threads in the threadgroup - //threadgroup_barrier(mem_flags::mem_threadgroup); - //for (uint i = nth/2; i > 0; i /= 2) { - // if (ith < i) { - // sum[ith] += sum[ith + i]; - // } - // threadgroup_barrier(mem_flags::mem_threadgroup); - //} - - //if (ith == 0) { - // dst[r1*ne0 + r0] = sum[0]; - //} } +#else +kernel void kernel_mul_mat_q4_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne10, + constant int64_t & ne0, + constant int64_t & ne01[[buffer(4)]], + uint2 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int ix = tiisg/4; // 0...7 + const int it = tiisg%4; // 0...3 + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row; + device const float * y = (device const float *) src1 + r1*ne10; + float yl[8]; + float yh[8]; + float sumf[N_DST]={0.f}, all_sum; + + const int step = sizeof(block_q4_K) * nb / 2; + + device const float * y4 = y + ix * QK_K + 8 * it; + + uint16_t sc16[4]; + + for (int ib = ix; ib < nb; ib += 8) { + + float2 sumy = {0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i] = y4[i+ 0]; sumy[0] += yl[i]; + yh[i] = y4[i+32]; sumy[1] += yh[i]; + } + + device const uint16_t * sc = (device const uint16_t *)x[ib].scales; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it; + device const half * dh = x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + sc16[0] = sc[0] & 0x000f; + sc16[1] = sc[0] & 0x0f00; + sc16[2] = sc[0] & 0x00f0; + sc16[3] = sc[0] & 0xf000; + + float2 acc1 = {0.f, 0.f}; + float2 acc2 = {0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+0] * (qs[i/2] & 0x000F); + acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00); + acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0); + acc2[1] += yh[i+1] * (qs[i/2] & 0xF000); + } + + float dall = dh[0]; + float dmin = dh[1]; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] + + (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) - + dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f); + + qs += step; + sc += step; + dh += step; + } + + y4 += 8 * QK_K; + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + first_row + row] = all_sum; + } + } +} +#endif kernel void kernel_mul_mat_q5_K_f32( device const void * src0, From e782c9e735f93ab4767ffc37462c523b73a17ddc Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 20 Jul 2023 18:19:45 +0300 Subject: [PATCH 13/16] Faster Q5_K and Q6_K on Metal (#2294) * Faster Q6_K on Metal * Faster Q5_K on Metal * Another Q5_K speedup --------- Co-authored-by: Iwan Kawrakow --- ggml-metal.m | 19 ++-- ggml-metal.metal | 230 ++++++++++++++++++++++++++--------------------- 2 files changed, 137 insertions(+), 112 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 5e2a21100..44d046877 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -703,8 +703,8 @@ void ggml_metal_graph_compute( GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne12 == 1); - nth0 = 4; - nth1 = 16; + nth0 = 2; + nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32]; } break; case GGML_TYPE_Q6_K: @@ -712,8 +712,8 @@ void ggml_metal_graph_compute( GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne12 == 1); - nth0 = 4; - nth1 = 16; + nth0 = 2; + nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32]; } break; default: @@ -743,11 +743,14 @@ void ggml_metal_graph_compute( src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src0t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src0t == GGML_TYPE_Q2_K || - src0t == GGML_TYPE_Q3_K || - src0t == GGML_TYPE_Q4_K || - src0t == GGML_TYPE_Q5_K || - src0t == GGML_TYPE_Q6_K) { + src0t == GGML_TYPE_Q3_K) { [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { diff --git a/ggml-metal.metal b/ggml-metal.metal index a9d134d6e..f71e8f33b 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1642,39 +1642,39 @@ kernel void kernel_mul_mat_q5_K_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb; + const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; + + device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb; device const float * yy = (device const float *) src1 + r1*ne10; - const int nth = tptg.x*tptg.y; - const int ith = tptg.y*tpitg.x + tpitg.y; + float sumf[2]={0.f}; - float sumf = 0; + const int step = sizeof(block_q5_K) * nb; #if QK_K == 256 +# + float yl[16], yh[16]; const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; - const int tid = tpitg.y; // 0...16 - const int il = tid/4; // 0...3 - const int ir = tid - 4*il;// 0...3 - const int n = 4; + const int tid = tiisg/4; + const int ix = tiisg%4; + const int im = tid/4; + const int ir = tid%4; + const int n = 8; - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); + const int l0 = n*ir; const int q_offset = 32*im + l0; const int y_offset = 64*im + l0; @@ -1683,78 +1683,114 @@ kernel void kernel_mul_mat_q5_K_f32( const uint8_t hm3 = hm1 << 4; const uint8_t hm4 = hm2 << 4; - uchar2 sc1, sc2, sc3, sc4; + uint16_t sc16[4]; + thread const uint8_t * sc8 = (thread const uint8_t *)sc16; - for (int i = tpitg.x; i < nb; i += tptg.x) { + device const float * y1 = yy + ix*QK_K + y_offset; - device const uint8_t * q1 = (x + i)->qs + q_offset; - device const uint8_t * q2 = q1 + 64; - device const uint8_t * qh = (x + i)->qh + l0; - device const float * y1 = yy + i*QK_K + y_offset; - device const float * y2 = y1 + 128; + for (int i = ix; i < nb; i += 4) { - const float dall = (float)((x + i)->d); - const float dmin = (float)((x + i)->dmin); + device const uint8_t * q1 = x[i].qs + q_offset; + device const uint8_t * qh = x[i].qh + l0; + device const half * dh = &x[i].d; + device const uint16_t * a = (device const uint16_t *)x[i].scales + im; - device const uint16_t * a = (device const uint16_t *)(x + i)->scales; - sc1 = as_type((uint16_t)(a[im+0] & kmask1)); - sc2 = as_type((uint16_t)(a[im+2] & kmask1)); - sc3 = as_type((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2))); - sc4 = as_type((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2))); + device const float * y2 = y1 + 128; + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < 8; ++l) { + yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0]; + yl[l+8] = y1[l+32]; sumy[1] += yl[l+8]; + yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0]; + yh[l+8] = y2[l+32]; sumy[3] += yh[l+8]; + } - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < n; ++l) { + for (int row = 0; row < 2; ++row) { - s[0] += y1[l+ 0] * ((q1[l] & 0xF) + (qh[l] & hm1 ? 16 : 0)); - s[1] += y1[l+32] * ((q1[l] >> 4) + (qh[l] & hm2 ? 16 : 0)); - s[2] += y2[l+ 0] * ((q2[l] & 0xF) + (qh[l] & hm3 ? 16 : 0)); - s[3] += y2[l+32] * ((q2[l] >> 4) + (qh[l] & hm4 ? 16 : 0)); - smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1]; + device const uint8_t * q2 = q1 + 64; + + sc16[0] = a[0] & kmask1; + sc16[1] = a[2] & kmask1; + sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2); + sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2); + + float4 acc = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < n; ++l) { + uint8_t h = qh[l]; + acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0)); + acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0)); + acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0)); + acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0)); + } + const float dall = dh[0]; + const float dmin = dh[1]; + sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) - + dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); + + q1 += step; + qh += step; + dh += step/2; + a += step/2; } - sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin; + + y1 += 4 * QK_K; } #else - const int il = 4 * tpitg.x; // 0, 4, 8, 12 - const int im = il/8; // 0, 0, 1, 1 - const int in = il%8; // 0, 4, 0, 4 + float yl[8], yh[8]; - for (int i = tpitg.y; i < nb; i += tptg.y) { + const int il = 4 * (tiisg/8); // 0, 4, 8, 12 + const int ix = tiisg%8; + const int im = il/8; // 0, 0, 1, 1 + const int in = il%8; // 0, 4, 0, 4 - const float d = (float)x[i].d; + device const float * y = yy + ix*QK_K + il; + + for (int i = ix; i < nb; i += 8) { + + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < 4; ++l) { + yl[l+0] = y[l+ 0]; + yl[l+4] = y[l+16]; + yh[l+0] = y[l+32]; + yh[l+4] = y[l+48]; + } + + device const half * dh = &x[i].d; device const uint8_t * q = x[i].qs + il; device const uint8_t * h = x[i].qh + in; device const int8_t * s = x[i].scales; - device const float * y = yy + i*QK_K + il; - for (int l = 0; l < 4; ++l) { - const uint8_t hl = h[l] >> im; - sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16)) - + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16)) - + y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16)) - + y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16)); + for (int row = 0; row < 2; ++row) { + + const float d = dh[0]; + + float2 acc = {0.f, 0.f}; + for (int l = 0; l < 4; ++l) { + const uint8_t hl = h[l] >> im; + acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16)) + + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16)); + acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256)) + + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256)); + } + sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]); + + q += step; + h += step; + s += step; + dh += step/2; + } + + y += 8 * QK_K; } #endif - sum[ith] = sumf; - // - // Accumulate the sum from all threads in the threadgroup - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; + for (int row = 0; row < 2; ++row) { + const float tot = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + first_row + row] = tot; + } } } @@ -1766,10 +1802,9 @@ kernel void kernel_mul_mat_q6_K_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const uint8_t kmask1 = 0x03; const uint8_t kmask2 = 0x0C; @@ -1781,19 +1816,18 @@ kernel void kernel_mul_mat_q6_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb; - device const float * yy = (device const float *) src1 + r1*ne10; + const int row = 2 * r0 + sgitg; - const int nth = tptg.x*tptg.y; - const int ith = tptg.y*tpitg.x + tpitg.y; + device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb; //r0*nb; + device const float * yy = (device const float *) src1 + r1*ne10; float sumf = 0; #if QK_K == 256 - // Note: we absolutely assume that tptg.y = 16 and QK_K = 256! - const int iqs = 16 * tpitg.y; - const int ip = iqs / 128; // 0 or 1 - const int il = (iqs - 128*ip)/16; // 0...7 + const int tid = tiisg/2; + const int ix = tiisg%2; + const int ip = tid/8; // 0 or 1 + const int il = tid%8; const int n = 4; const int l0 = n*il; const int is = 8*ip + l0/16; @@ -1802,9 +1836,10 @@ kernel void kernel_mul_mat_q6_K_f32( const int q_offset_l = 64*ip + l0; const int q_offset_h = 32*ip + l0; - for (int i = tpitg.x; i < nb; i += tptg.x) { + for (int i = ix; i < nb; i += 2) { - device const uint8_t * ql = x[i].ql + q_offset_l; + device const uint8_t * q1 = x[i].ql + q_offset_l; + device const uint8_t * q2 = q1 + 32; device const uint8_t * qh = x[i].qh + q_offset_h; device const int8_t * sc = x[i].scales + is; @@ -1814,19 +1849,21 @@ kernel void kernel_mul_mat_q6_K_f32( float4 sums = {0.f, 0.f, 0.f, 0.f}; for (int l = 0; l < n; ++l) { - sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); - sums[1] += y[l+32] * ((int8_t)((ql[l+32] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); - sums[2] += y[l+64] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) << 0)) - 32); - sums[3] += y[l+96] * ((int8_t)((ql[l+32] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); + sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); + sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); + sums[2] += y[l+64] * ((int8_t)((q1[l] >> 4) | ((qh[l] & kmask3) << 0)) - 32); + sums[3] += y[l+96] * ((int8_t)((q2[l] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); } sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]); } -#else - const int il = 4*tpitg.x; // 0, 4, 8, 12 - for (int i = tpitg.y; i < nb; i += tptg.y) { +#else + const int ix = tiisg/4; + const int il = 4*(tiisg%4); + + for (int i = ix; i < nb; i += 8) { device const float * y = yy + i * QK_K + il; device const uint8_t * ql = x[i].ql + il; device const uint8_t * qh = x[i].qh + il; @@ -1846,23 +1883,8 @@ kernel void kernel_mul_mat_q6_K_f32( #endif - sum[ith] = sumf; - - // - // Accumulate the sum from all threads in the threadgroup - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i]; + const float tot = simd_sum(sumf); + if (tiisg == 0) { + dst[r1*ne0 + row] = tot; } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; - } - } From 9cf022a1889e50113fd348dc96b4557fc75a6296 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Pawe=C5=82czyk?= Date: Fri, 21 Jul 2023 09:42:21 +0200 Subject: [PATCH 14/16] make : fix embdinput library and server examples building on MSYS2 (#2235) * make : fix embdinput library and server examples building on MSYS2 * cmake : fix server example building on MSYS2 --- Makefile | 34 ++++++++++++++++++++++++++++------ examples/server/CMakeLists.txt | 3 +++ 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 6c74e1346..cff4d97fe 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test default: $(BUILD_TARGETS) @@ -90,6 +90,28 @@ ifeq ($(UNAME_S),Haiku) CXXFLAGS += -pthread endif +# detect Windows +ifneq ($(findstring _NT,$(UNAME_S)),) + _WIN32 := 1 +endif + +# library name prefix +ifneq ($(_WIN32),1) + LIB_PRE := lib +endif + +# Dynamic Shared Object extension +ifneq ($(_WIN32),1) + DSO_EXT := .so +else + DSO_EXT := .dll +endif + +# Windows Sockets 2 (Winsock) for network-capable apps +ifeq ($(_WIN32),1) + LWINSOCK2 := -lws2_32 +endif + ifdef LLAMA_GPROF CFLAGS += -pg CXXFLAGS += -pg @@ -294,7 +316,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h + rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h # # Examples @@ -325,14 +347,14 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml. $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) +$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) -embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput +embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 812a24b09..3782f9b80 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -7,6 +7,9 @@ target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +if (WIN32) + TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) +endif() target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO) From e68c96f7fee8fc22814a4a1209ffc97bbf35f7bd Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Fri, 21 Jul 2023 10:44:40 +0300 Subject: [PATCH 15/16] Faster Q2_K on Metal (#2297) * Faster Q2_K on Metal * Deleting unnoticed and dangereous trailing white space * Fixed bug in new metal Q2_K implementation --------- Co-authored-by: Iwan Kawrakow --- ggml-metal.m | 9 ++- ggml-metal.metal | 183 +++++++++++++++++++++++++++-------------------- 2 files changed, 108 insertions(+), 84 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 44d046877..135bda9fc 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -676,8 +676,8 @@ void ggml_metal_graph_compute( GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne12 == 1); - nth0 = 4; - nth1 = 16; + nth0 = 2; + nth1 = 32; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32]; } break; case GGML_TYPE_Q3_K: @@ -740,7 +740,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || - src0t == GGML_TYPE_Q4_K) { + src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q5_K) { @@ -749,8 +749,7 @@ void ggml_metal_graph_compute( else if (src0t == GGML_TYPE_Q6_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src0t == GGML_TYPE_Q2_K || - src0t == GGML_TYPE_Q3_K) { + else if (src0t == GGML_TYPE_Q3_K) { [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { diff --git a/ggml-metal.metal b/ggml-metal.metal index f71e8f33b..97f5c10ba 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1209,108 +1209,133 @@ kernel void kernel_mul_mat_q2_K_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], + constant int64_t & ne01[[buffer(4)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row; + device const float * y = (device const float *) src1 + r1*ne10; + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; - device const block_q2_K * x = (device const block_q2_K *) src0 + r0*nb; - device const float * yy = (device const float *) src1 + r1*ne10; - - const int nth = tptg.x*tptg.y; - const int ith = tptg.y*tpitg.x + tpitg.y; - - float sumf = 0; + const int step = sizeof(block_q2_K) * nb; #if QK_K == 256 - const int tid = tpitg.y; // 0...16 - const int il = tid/4; // 0...3 - const int ir = tid%4; // 0...3 - const int ip = il/2; // 0 or 1 - const int shift1 = 4*(il%2);// 0 or 4 - const int shift2 = shift1+2;// 2 or 6 - const int n = 8; - const int is = 4*il + (n*ir)/16; + const int ix = tiisg/8; // 0...3 + const int it = tiisg%8; // 0...7 + const int im = it/4; // 0 or 1 + const int ir = it%4; // 0...3 + const int is = (8*ir)/16;// 0 or 1 - const int y_offset = 64*il + n*ir; - const int q_offset = 32*ip + n*ir; + device const float * y4 = y + ix * QK_K + 128 * im + 8 * ir; - for (int i = tpitg.x; i < nb; i += tptg.x) { + for (int ib = ix; ib < nb; ib += 4) { - device const uint8_t * q = x[i].qs + q_offset; - device const uint8_t * scales = x[i].scales + is; - - uint8_t d1 = scales[0] & 0xF; - uint8_t d2 = scales[2] & 0xF; - uint8_t m1 = scales[0] >> 4; - uint8_t m2 = scales[2] >> 4; - - device const float * y = yy + i*QK_K + y_offset; - - float2 s = {0.f, 0.f}; - float smin = 0; - for (int l = 0; l < n; ++l) { - s[0] += y[l+ 0] * ((q[l] >> shift1) & 3); - s[1] += y[l+32] * ((q[l] >> shift2) & 3); - smin += y[l+ 0] * m1 + y[l+32] * m2; + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; + yl[i+ 8] = y4[i+32]; sumy[1] += yl[i+ 8]; + yl[i+16] = y4[i+64]; sumy[2] += yl[i+16]; + yl[i+24] = y4[i+96]; sumy[3] += yl[i+24]; } - const float dall = (float)x[i].d; - const float dmin = (float)x[i].dmin; + device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*im + is; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const half * dh = &x[ib].d; - sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin; + for (int row = 0; row < N_DST; row++) { + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); + acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); + acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c); + acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); + acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); + acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); + acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); + acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); + } + float dall = dh[0]; + float dmin = dh[1] * 1.f/16.f; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f + + (acc1[1] + 1.f/256.f * acc2[1]) * (sc[2] & 0xF) * 1.f/ 4.f + + (acc1[2] + 1.f/256.f * acc2[2]) * (sc[4] & 0xF) * 1.f/16.f + + (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) - + dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0)); + + qs += step/2; + sc += step; + dh += step/2; + } + + y4 += 4 * QK_K; } #else - const int il = 4 * tpitg.x; + const int ix = tiisg/2; // 0...15 + const int it = tiisg%2; // 0...1 - uint32_t aux[2]; - thread const uint8_t * d = (thread const uint8_t *)aux; - thread const uint8_t * m = (thread const uint8_t *)aux + 4; + device const float * y4 = y + ix * QK_K + 8 * it; - for (int i = tpitg.y; i < nb; i += tptg.y) { + for (int ib = ix; ib < nb; ib += 16) { - device const uint8_t * q = x[i].qs + il; - device const float * y = yy + i*QK_K + il; - - const float dall = (float)x[i].d; - const float dmin = (float)x[i].dmin; - - device const uint32_t * a = (device const uint32_t *)x[i].scales; - aux[0] = a[0] & 0x0f0f0f0f; - aux[1] = (a[0] >> 4) & 0x0f0f0f0f; - - for (int l = 0; l < 4; ++l) { - sumf += y[l+ 0] * (dall * d[0] * ((q[l] >> 0) & 3) - dmin * m[0]) - + y[l+16] * (dall * d[1] * ((q[l] >> 2) & 3) - dmin * m[1]) - + y[l+32] * (dall * d[2] * ((q[l] >> 4) & 3) - dmin * m[2]) - + y[l+48] * (dall * d[3] * ((q[l] >> 6) & 3) - dmin * m[3]); + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; + yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8]; + yl[i+16] = y4[i+32]; sumy[2] += yl[i+16]; + yl[i+24] = y4[i+48]; sumy[3] += yl[i+24]; } + + device const uint8_t * sc = (device const uint8_t *)x[ib].scales; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it; + device const half * dh = &x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); + acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); + acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c); + acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); + acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); + acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); + acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); + acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); + } + + float dall = dh[0]; + float dmin = dh[1]; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f + + (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f + + (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f + + (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) - + dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4)); + + qs += step/2; + sc += step; + dh += step/2; + } + + y4 += 16 * QK_K; } #endif - sum[ith] = sumf; - - // - // Accumulate the sum from all threads in the threadgroup - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + first_row + row] = all_sum; + } } } From 019fe257bbf699f400231683a8b816ad90281275 Mon Sep 17 00:00:00 2001 From: Hatsune Miku <129688334+at8u@users.noreply.github.com> Date: Fri, 21 Jul 2023 08:13:18 +0000 Subject: [PATCH 16/16] =?UTF-8?q?MIKU=20MAYHEM:=20Upgrading=20the=20Defaul?= =?UTF-8?q?t=20Model=20for=20Maximum=20Fun=20=F0=9F=8E=89=20(#2287)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Miku.sh: Set default model to llama-2-7b-chat * Miku.sh: Set ctx_size to 4096 * Miku.sh: Add in-prefix/in-suffix opts * Miku.sh: Switch sampler to mirostat_v2 and tiny prompt improvements --- examples/Miku.sh | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/Miku.sh b/examples/Miku.sh index c44d9ae74..b9174b4e6 100755 --- a/examples/Miku.sh +++ b/examples/Miku.sh @@ -2,21 +2,21 @@ set -e AI_NAME="${AI_NAME:-Miku}" -MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}" +MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}" USER_NAME="${USER_NAME:-Anon}" # Uncomment and adjust to the number of CPU cores you want to use. #N_THREAD="${N_THREAD:-4}" +CTX_SIZE="${CTX_SIZE:-4096}" N_PREDICTS="${N_PREDICTS:-4096}" GEN_OPTIONS=(--batch_size 1024 ---ctx_size 2048 +--ctx_size "$CTX_SIZE" --keep -1 --repeat_last_n 256 --repeat_penalty 1.17647 ---temp 0.7 ---top_k 40 ---top_p 0.5) +--temp 0.6 +--mirostat 2) if [ -n "$N_THREAD" ]; then GEN_OPTIONS+=(--threads "$N_THREAD") @@ -24,16 +24,17 @@ fi ./main "${GEN_OPTIONS[@]}" \ --model "$MODEL" \ + --in-prefix " " \ + --in-suffix "${AI_NAME}:" \ --n_predict "$N_PREDICTS" \ --color --interactive \ --reverse-prompt "${USER_NAME}:" \ - --prompt " -This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer. + --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer. ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help. ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad. ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her. -The conversation is only between ${USER_NAME} and ${AI_NAME} +The conversation is only between ${USER_NAME} and ${AI_NAME}. The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice. ${AI_NAME} can only communicate through text, so she can't send images or videos.