diff --git a/examples/base-translate.sh b/examples/base-translate.sh
new file mode 100755
index 000000000..00dedd0df
--- /dev/null
+++ b/examples/base-translate.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#
+# Few-shot translation example.
+# Requires a base model (i.e. not a fine-tuned or instruct model).
+#
+# Usage:
+#
+#   cd llama.cpp
+#   make -j
+#
+#   ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
+#
+
+if [ $# -lt 2 ]; then
+  echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
+  exit 1
+fi
+
+eargs=""
+if [ $# -gt 2 ]; then
+  eargs="${@:3}"
+fi
+
+ftmp="__llama.cpp_example_tmp__.txt"
+trap "rm -f $ftmp" EXIT
+
+echo "Translate from English to French:
+
+===
+
+sea otter, peppermint, plush girafe:
+
+sea otter => loutre de mer
+peppermint => menthe poivrée
+plush girafe => girafe peluche
+
+===
+
+violin
+
+violin => violon
+
+===
+
+phone, computer, mouse, keyboard:
+
+phone => téléphone
+computer => ordinateur
+mouse => souris
+keyboard => clavier
+
+===
+" > $ftmp
+
+echo "$2
+" >> $ftmp
+
+model=$1
+
+# generate the most likely continuation until the string "===" is found
+./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
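Note on the `-r "==="` flag above: `main` stops decoding once the generated text reaches the reverse prompt, so the continuation ends at the next few-shot separator. A minimal sketch of that kind of stop-string check (hypothetical helper, not `main`'s actual implementation):

```cpp
#include <string>

// After appending each newly decoded piece to `output`, stop generation
// once the reverse prompt appears at the very end of the output.
static bool hit_stop_string(const std::string & output, const std::string & stop) {
    if (output.size() < stop.size()) {
        return false;
    }
    return output.compare(output.size() - stop.size(), stop.size(), stop) == 0;
}
// usage: if (hit_stop_string(generated, "===")) { /* truncate and exit */ }
```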
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index d0f96946c..2e570d3c5 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -139,6 +139,7 @@ struct cmd_params {
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
     std::vector<int> main_gpu;
+    std::vector<bool> no_kv_offload;
     std::vector<bool> mul_mat_q;
     std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
     int reps;
@@ -156,6 +157,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_threads     */ {get_num_physical_cores()},
     /* n_gpu_layers  */ {99},
     /* main_gpu      */ {0},
+    /* no_kv_offload */ {false},
     /* mul_mat_q     */ {true},
     /* tensor_split  */ {{}},
     /* reps          */ 5,
@@ -177,6 +179,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -ngl, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -mg, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf("  -nkvo, --no-kv-offload <0|1>      (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -mmq, --mul-mat-q <0|1>           (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf("  -ts, --tensor_split <ts0/ts1/..>\n");
     printf("  -r, --repetitions <n>             (default: %d)\n", cmd_params_defaults.reps);
@@ -310,6 +313,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.main_gpu = split<int>(argv[i], split_delim);
+        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
         } else if (arg == "-mmq" || arg == "--mul-mat-q") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -384,6 +394,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
+    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
@@ -401,6 +412,7 @@ struct cmd_params_instance {
     int n_threads;
     int n_gpu_layers;
     int main_gpu;
+    bool no_kv_offload;
     bool mul_mat_q;
     std::array<float, LLAMA_MAX_DEVICES> tensor_split;
@@ -429,6 +441,7 @@ struct cmd_params_instance {
         cparams.type_k = type_k;
         cparams.type_v = type_v;
         cparams.mul_mat_q = mul_mat_q;
+        cparams.offload_kqv = !no_kv_offload;
 
         return cparams;
     }
@@ -445,6 +458,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nkvo : params.no_kv_offload)
     for (const auto & nt : params.n_threads) {
         cmd_params_instance instance = {
             /* .model        = */ m,
@@ -456,6 +470,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
             /* .n_threads    = */ nt,
             /* .n_gpu_layers = */ nl,
             /* .main_gpu     = */ mg,
+            /* .no_kv_offload= */ nkvo,
             /* .mul_mat_q    = */ mmq,
             /* .tensor_split = */ ts,
         };
@@ -477,6 +492,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nkvo : params.no_kv_offload)
     for (const auto & nt : params.n_threads) {
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
@@ -492,6 +508,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_threads    = */ nt,
                 /* .n_gpu_layers = */ nl,
                 /* .main_gpu     = */ mg,
+                /* .no_kv_offload= */ nkvo,
                 /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
             };
@@ -512,6 +529,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_threads    = */ nt,
                 /* .n_gpu_layers = */ nl,
                 /* .main_gpu     = */ mg,
+                /* .no_kv_offload= */ nkvo,
                 /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
             };
@@ -560,6 +578,7 @@ struct test {
     ggml_type type_v;
     int n_gpu_layers;
     int main_gpu;
+    bool no_kv_offload;
     bool mul_mat_q;
     std::array<float, LLAMA_MAX_DEVICES> tensor_split;
     int n_prompt;
@@ -580,6 +599,7 @@ struct test {
         type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
+        no_kv_offload = inst.no_kv_offload;
         mul_mat_q = inst.mul_mat_q;
         tensor_split = inst.tensor_split;
         n_prompt = inst.n_prompt;
@@ -641,7 +661,8 @@ struct test {
         "cpu_info", "gpu_info",
         "model_filename", "model_type", "model_size", "model_n_params",
         "n_batch", "n_threads", "type_k", "type_v",
-        "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
+        "n_gpu_layers", "main_gpu", "no_kv_offload",
+        "mul_mat_q", "tensor_split",
         "n_prompt", "n_gen", "test_time",
         "avg_ns", "stddev_ns",
         "avg_ts", "stddev_ts"
@@ -660,7 +681,7 @@ struct test {
             return INT;
         }
         if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "mul_mat_q") {
+            field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -691,7 +712,8 @@ struct test {
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
-            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
+            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload),
+            std::to_string(mul_mat_q), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -852,6 +874,9 @@ struct markdown_printer : public printer {
         if (field == "mul_mat_q") {
             return "mmq";
         }
+        if (field == "no_kv_offload") {
+            return "nkvo";
+        }
         if (field == "tensor_split") {
             return "ts";
         }
@@ -886,6 +911,9 @@ struct markdown_printer : public printer {
         if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
             fields.push_back("mul_mat_q");
         }
+        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
+            fields.push_back("no_kv_offload");
+        }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.push_back("tensor_split");
         }
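For review context: each value passed to `-nkvo` becomes one more dimension of the benchmark matrix (the nested loops above), and the flag is inverted into `llama_context_params.offload_kqv`. A minimal sketch of that mapping outside of llama-bench (the wrapper function is hypothetical; `llama_context_default_params()` and `offload_kqv` are the real API used by the diff):

```cpp
#include "llama.h"

llama_context_params make_params(bool no_kv_offload) {
    llama_context_params cparams = llama_context_default_params();
    cparams.offload_kqv = !no_kv_offload; // -nkvo 1 keeps the KV cache in host memory
    return cparams;
}
```

So `./llama-bench -m model.gguf -nkvo 0,1` runs every benchmark twice, once with and once without KV offloading, and the new `nkvo` column appears in the markdown output whenever more than one value is tested.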
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index 66244382f..fc79fd346 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -1,8 +1,5 @@
 import Foundation
-
-// To use this in your own project, add llama.cpp as a swift package dependency
-// and uncomment this import line.
-// import llama
+import llama
 
 enum LlamaError: Error {
     case couldNotInitializeContext
@@ -161,7 +158,7 @@ actor LlamaContext {
             new_token_id = llama_sample_token_greedy(context, &candidates_p)
         }
 
-        if new_token_id == llama_token_eos(context) || n_cur == n_len {
+        if new_token_id == llama_token_eos(model) || n_cur == n_len {
             print("\n")
             let new_token_str = String(cString: temporary_invalid_cchars + [0])
             temporary_invalid_cchars.removeAll()
diff --git a/examples/llama.swiftui/llama.cpp.swift/bridging-header.h b/examples/llama.swiftui/llama.cpp.swift/bridging-header.h
deleted file mode 100644
index 6cd72c979..000000000
--- a/examples/llama.swiftui/llama.cpp.swift/bridging-header.h
+++ /dev/null
@@ -1,5 +0,0 @@
-//
-// Use this file to import your target's public headers that you would like to expose to Swift.
-//
-
-#import "llama.h"
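The `llama_token_eos(model)` fix mirrors the C API, where the special-token getters take the model rather than the context. The equivalent check from C/C++ (the helper is hypothetical; `llama_token_eos(const struct llama_model *)` is the real signature):

```cpp
#include "llama.h"

// new_token_id is the token just sampled for this decoding step
static bool is_eos(const llama_model * model, llama_token new_token_id) {
    return new_token_id == llama_token_eos(model); // takes the model, not the context
}
```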
-// - -#import "llama.h" diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj index a70750a22..a8848a49f 100644 --- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj +++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj @@ -7,68 +7,31 @@ objects = { /* Begin PBXBuildFile section */ - 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; }; - 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; }; - 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; }; - 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; }; - 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; }; 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; - 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; }; 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; }; 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; }; 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; }; 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; }; - 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; }; 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; }; 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; + DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; }; F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; }; - F1FE20DC2B465C4500B45541 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; }; /* End PBXBuildFile section */ -/* Begin PBXBuildRule 
section */ - F1FE20DB2B465C2100B45541 /* PBXBuildRule */ = { - isa = PBXBuildRule; - compilerSpec = com.apple.compilers.proxy.script; - fileType = sourcecode.metal; - inputFiles = ( - ); - isEditable = 1; - outputFiles = ( - "${DERIVED_FILES_DIR}/ggml-metal.air", - "${DERIVED_FILES_DIR}/ggml.metallib", - ); - script = "# metal\nxcrun metal -c \"${INPUT_FILE_PATH}\" -o \"${DERIVED_FILES_DIR}/${INPUT_FILE_BASE}.air\"\nxcrun metallib -o \"${DERIVED_FILES_DIR}/${INPUT_FILE_BASE%-metal}.metallib\" \"${DERIVED_FILES_DIR}/${INPUT_FILE_BASE}.air\"\n"; - }; -/* End PBXBuildRule section */ - /* Begin PBXFileReference section */ - 542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = ""; }; - 542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = ""; }; - 542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = ""; }; - 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = ""; }; - 542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = ""; }; - 542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = ""; }; - 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = ""; }; - 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = ""; }; - 542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = ""; }; - 542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = ""; }; - 549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = ""; }; - 549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = ""; }; - 549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = ""; }; 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = ""; }; 
- 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = ""; }; 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; }; 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = ""; }; 8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; - 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = ""; }; 8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; 8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = ""; }; 8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = ""; }; 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = ""; }; + DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = ""; }; F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = ""; }; /* End PBXFileReference section */ @@ -77,6 +40,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( + DF810E132B4A5BA200301144 /* llama in Frameworks */, 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, ); @@ -85,30 +49,10 @@ /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ - 8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = { - isa = PBXGroup; - children = ( - 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */, - 542376092B0D9C40008E6A1C /* ggml-backend.h */, - 542376062B0D9BEA008E6A1C /* ggml-quants.h */, - 542376072B0D9BFB008E6A1C /* ggml-quants.c */, - 549479C82AC9E10B00E0F78B /* ggml-metal.metal */, - 549479C62AC9E0F200E0F78B /* ggml-metal.h */, - 549479C52AC9E0F200E0F78B /* ggml-metal.m */, - 542EA09B2AC8723900A8AEE9 /* ggml.c */, - 542EA09C2AC8723900A8AEE9 /* ggml.h */, - 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */, - 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */, - 542EA0A12AC8729100A8AEE9 /* llama.cpp */, - 542EA0A22AC8729100A8AEE9 /* llama.h */, - ); - name = llama.cpp; - sourceTree = ""; - }; 8A1C836A2AC328BD0096AF73 = { isa = PBXGroup; children = ( - 8A08D1F62AC7383900FE6CD4 /* llama.cpp */, + DF2D2FE72B4A59BE00FCB72D /* llama.cpp */, 8A907F312AC7134E006146EA /* llama.cpp.swift */, 8A3F84232AC4C891005E2EE8 /* models */, 8A1C83752AC328BD0096AF73 /* llama.swiftui */, @@ -133,19 +77,10 @@ 8A9F7C4A2AC332BF008AE1EA /* UI */, 8A1C83762AC328BD0096AF73 /* 
llama_swiftuiApp.swift */, 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */, - 8A1C837C2AC328BE0096AF73 /* Preview Content */, ); path = llama.swiftui; sourceTree = ""; }; - 8A1C837C2AC328BE0096AF73 /* Preview Content */ = { - isa = PBXGroup; - children = ( - 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */, - ); - path = "Preview Content"; - sourceTree = ""; - }; 8A39BE082AC7601000BFEB40 /* Frameworks */ = { isa = PBXGroup; children = ( @@ -173,7 +108,6 @@ 8A907F312AC7134E006146EA /* llama.cpp.swift */ = { isa = PBXGroup; children = ( - 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */, 8A907F322AC7134E006146EA /* LibLlama.swift */, ); path = llama.cpp.swift; @@ -209,12 +143,12 @@ 8A1C83712AC328BD0096AF73 /* Resources */, ); buildRules = ( - F1FE20DB2B465C2100B45541 /* PBXBuildRule */, ); dependencies = ( ); name = llama.swiftui; packageProductDependencies = ( + DF810E122B4A5BA200301144 /* llama */, ); productName = llama.swiftui; productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; @@ -261,9 +195,7 @@ isa = PBXResourcesBuildPhase; buildActionMask = 2147483647; files = ( - F1FE20DC2B465C4500B45541 /* ggml-metal.metal in Resources */, 8A3F84242AC4C891005E2EE8 /* models in Resources */, - 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */, 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */, ); runOnlyForDeploymentPostprocessing = 0; @@ -275,18 +207,12 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( - 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */, - 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */, F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */, - 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */, 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */, - 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */, 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */, 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */, 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */, 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */, - 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */, - 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -416,11 +342,9 @@ isa = XCBuildConfiguration; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; DEVELOPMENT_TEAM = STLSG3FG8Q; ENABLE_PREVIEWS = YES; GENERATE_INFOPLIST_FILE = YES; @@ -437,11 +361,12 @@ MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; PRODUCT_NAME = "$(TARGET_NAME)"; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator xros xrsimulator"; + SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; SWIFT_OPTIMIZATION_LEVEL = "-Onone"; SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; + TARGETED_DEVICE_FAMILY = "1,2,7"; }; name = Debug; }; @@ -449,11 +374,9 @@ isa = XCBuildConfiguration; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; DEVELOPMENT_TEAM = STLSG3FG8Q; ENABLE_PREVIEWS = YES; 
diff --git a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json
deleted file mode 100644
index eb8789700..000000000
--- a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AccentColor.colorset/Contents.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "colors" : [
-    {
-      "idiom" : "universal"
-    }
-  ],
-  "info" : {
-    "author" : "xcode",
-    "version" : 1
-  }
-}
diff --git a/examples/llama.swiftui/llama.swiftui/Preview Content/Preview Assets.xcassets/Contents.json b/examples/llama.swiftui/llama.swiftui/Preview Content/Preview Assets.xcassets/Contents.json
deleted file mode 100644
index 73c00596a..000000000
--- a/examples/llama.swiftui/llama.swiftui/Preview Content/Preview Assets.xcassets/Contents.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-  "info" : {
-    "author" : "xcode",
-    "version" : 1
-  }
-}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2062f96f1..d99dc75cf 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -448,8 +448,14 @@ struct llama_client_slot
     }
 
     bool has_budget(gpt_params &global_params) {
+        if (params.n_predict == -1 && global_params.n_predict == -1)
+        {
+            return true; // limitless
+        }
+
         n_remaining = -1;
-        if(params.n_predict != -1)
+
+        if (params.n_predict != -1)
         {
             n_remaining = params.n_predict - n_decoded;
         }
@@ -457,7 +463,8 @@ struct llama_client_slot
         {
             n_remaining = global_params.n_predict - n_decoded;
         }
-        return n_remaining > 0 || n_remaining == -1; // no budget || limitless
+
+        return n_remaining > 0; // no budget
     }
 
     bool available() const {
@@ -1103,7 +1110,7 @@ struct llama_server_context
             }
 
             // check the limits
-            if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params))
+            if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params))
             {
                 slot.stopped_limit = true;
                 slot.has_next_token = false;
@@ -1704,7 +1711,6 @@ struct llama_server_context
 
                 llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true);
 
-                slot.n_decoded += 1;
                 slot.n_past += 1;
             }
 
@@ -1922,6 +1928,7 @@ struct llama_server_context
 
                 llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
 
+                slot.n_decoded += 1;
                 if (slot.n_decoded == 1)
                 {
                     slot.t_start_genereration = ggml_time_us();
diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
index 05859935a..ca21b4743 100644
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -90,7 +90,7 @@ extern "C" {
         void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
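The reworked `has_budget()` handles the limitless case up front, so the return condition collapses to `n_remaining > 0`; together with moving `slot.n_decoded += 1` next to `llama_sampling_accept()`, the limit check can now fire from the very first decoded token. A standalone restatement of the fixed predicate (hypothetical helper, same logic as the diff):

```cpp
// slot_n_predict: per-request limit; global_n_predict: server default; -1 = unset
bool has_budget(int slot_n_predict, int global_n_predict, int n_decoded) {
    if (slot_n_predict == -1 && global_n_predict == -1) {
        return true; // limitless
    }
    // the per-request limit takes precedence over the server default
    const int limit = (slot_n_predict != -1) ? slot_n_predict : global_n_predict;
    return limit - n_decoded > 0;
}
// has_budget(-1, -1, 1000) == true; has_budget(4, -1, 4) == false
```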
diff --git a/ggml-backend.c b/ggml-backend.c
index 2c3752067..53e741cb8 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -195,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
     ggml_backend_synchronize(backend);
 }
 
-void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    backend->iface.graph_compute(backend, cgraph);
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    if (!backend->iface.graph_compute(backend, cgraph)) {
+        return false;
+    }
 
     // TODO: optional sync
     ggml_backend_synchronize(backend);
+    return true;
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -597,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -611,6 +614,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
     cplan.work_data = cpu_ctx->work_data;
 
     ggml_graph_compute(cgraph, &cplan);
+    return true;
 }
 
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
diff --git a/ggml-backend.h b/ggml-backend.h
index a9d2fddd7..85ff67b0e 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -58,7 +58,7 @@ extern "C" {
     GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
     GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends
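With `ggml_backend_graph_compute()` now returning `bool`, callers can propagate a failed backend computation (e.g. a failed Metal command buffer, see below) instead of aborting inside the backend. Hypothetical call site:

```cpp
if (!ggml_backend_graph_compute(backend, gf)) {
    // report the error to the caller instead of hitting a GGML_ASSERT deep
    // inside the backend; a caller could also retry on a different backend
    fprintf(stderr, "%s: ggml_backend_graph_compute() failed\n", __func__);
    return false;
}
```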
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 62e7ab497..d538b6917 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -183,7 +183,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(__gfx1100__)
+#elif defined(RDNA3)
     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
 #elif defined(__gfx1010__) || defined(__gfx900__)
     int tmp1;
@@ -1873,14 +1873,6 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const float * x = (const float *) vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x = x[ib + iqs + 0];
-    v.y = x[ib + iqs + 1];
-}
-
 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
     const int ix = blockDim.x*blockIdx.x + threadIdx.x;
@@ -1984,7 +1976,7 @@ static __global__ void k_get_rows_float(
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
+    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
 
     if (i >= k) {
         return;
@@ -2003,6 +1995,19 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
     y[iybs + iqs + y_offset] = v.y;
 }
 
+template <typename src_t, typename dst_t>
+static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const src_t * x = (src_t *) vx;
+
+    y[i] = x[i];
+}
+
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
@@ -5610,7 +5615,7 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
     dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
@@ -5660,6 +5665,12 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }
 
+template <typename src_t, typename dst_t>
+static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
 static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -5683,7 +5694,7 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q6_K:
             return dequantize_row_q6_K_cuda;
         case GGML_TYPE_F32:
-            return dequantize_block_cuda<1, 1, convert_f32>;
+            return convert_unary_cuda<float>;
         default:
             return nullptr;
     }
@@ -5712,7 +5723,7 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         case GGML_TYPE_Q6_K:
             return dequantize_row_q6_K_cuda;
         case GGML_TYPE_F16:
-            return dequantize_block_cuda<1, 1, convert_f16>;
+            return convert_unary_cuda<half>;
         default:
             return nullptr;
     }
@@ -9911,7 +9922,7 @@ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_ba
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     ggml_cuda_set_main_device(cuda_ctx->device);
@@ -9968,6 +9979,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     }
 
     UNUSED(backend);
+
+    return true;
 }
 
 static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
diff --git a/ggml-metal.h b/ggml-metal.h
index b5e02b668..c4b7325da 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -87,7 +87,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
-void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 //
 // backend API
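The `dequantize_block` change fixes the global index math: each thread handles two consecutive elements, so the offset has to scale the whole `(block, thread)` pair, which is also why the launcher can halve the grid. Worked example, assuming `blockDim.x == 256` (the concrete numbers are illustrative):

```cpp
constexpr int old_idx(int block, int thread) { return 256*block + 2*thread; }   // before
constexpr int new_idx(int block, int thread) { return 2*(256*block + thread); } // after

// before: block 1 restarts inside block 0's range, so some elements are
// written twice while the tail of the array is never written at all
static_assert(old_idx(0, 128) == 256 && old_idx(1, 0) == 256, "overlap");
// after: blocks tile the array contiguously, two elements per thread
static_assert(new_idx(0, 255) == 510 && new_idx(1, 0) == 512, "contiguous");
```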
diff --git a/ggml-metal.m b/ggml-metal.m
index 940c3d3cd..a9f4ff704 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -258,14 +258,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
         NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
+        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
         if (libPath != nil) {
             // pre-compiled library found
             NSURL * libURL = [NSURL fileURLWithPath:libPath];
             GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
             ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
         } else {
-            GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
+            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
             NSString * sourcePath;
             NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
@@ -295,7 +295,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #endif
             // try to disable fast-math
             // NOTE: this seems to have no effect whatsoever
-            // instead, in order to disable fast-math, we have to build ggml.metallib from the command line
+            // instead, in order to disable fast-math, we have to build default.metallib from the command line
             // using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
             // and go through the "pre-compiled library found" path above
             //[options setFastMathEnabled:false];
@@ -977,7 +977,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
             return false;
     }
 }
-void ggml_metal_graph_compute(
+bool ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
         struct ggml_cgraph * gf) {
     @autoreleasepool {
@@ -2420,10 +2420,11 @@ preferably one under the recommended max working set size, or else fall back to
         } else {
             GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
         }
-        GGML_ASSERT(false);
+        return false;
     }
 }
 
+    return true;
     }
 }
@@ -2703,10 +2704,10 @@ static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggm
     UNUSED(backend);
 }
 
-static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
-    ggml_metal_graph_compute(metal_ctx, cgraph);
+    return ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 
 static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
diff --git a/ggml-quants.h b/ggml-quants.h
index 70c12c274..62c1df6cb 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
-// Effectively 2.5625 bits per weight
+// Effectively 2.625 bits per weight
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
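The comment fix is plain arithmetic: a `q2_K` super-block stores 16 scale bytes, 64 quant bytes, and two fp16 super-block scales (`d`/`dmin`, part of the full struct but outside this hunk), so with `QK_K == 256`:

```cpp
constexpr int QK_K        = 256;                    // super-block size (from ggml)
constexpr int block_bytes = QK_K/16 + QK_K/4 + 2*2; // scales + qs + d + dmin
static_assert(block_bytes == 84, "q2_K super-block is 84 bytes");
// 84 bytes * 8 bits / 256 weights = 2.625 bits per weight (2.5625 was wrong)
```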
diff --git a/ggml.c b/ggml.c
index e0bf50713..bab08f876 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9704,10 +9704,10 @@ static void ggml_compute_forward_group_norm(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
+static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];
@@ -9787,7 +9787,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         if (params->ith != 0) {
             return;
         }
@@ -16301,24 +16301,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_CLBLAST)
-                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -16491,6 +16473,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 state->shared->node_n += 1;
                 return (thread_ret_t) GGML_EXIT_ABORTED;
             }
+
             if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
                 // all other threads are finished and spinning
                 // do finalize and init here so we don't have synchronize again
@@ -16556,12 +16539,19 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
-            do {
-                #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_METAL)
-                //apple does nothing
-                #else
-                sched_yield();
-                #endif
+
+            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
+            while (true) {
+                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+                //       depending on the workload and the operating system.
+                //       since it is not clear what is the best approach, it should potentially become user-configurable
+                //       ref: https://github.com/ggerganov/ggml/issues/291
+                // UPD:  adding the do_yield flag seems to resolve the issue universally
+                if (do_yield) {
+                    sched_yield();
+                }
+
                 node_n = atomic_load(&state->shared->node_n);
-            } while (node_n == last);
+                if (node_n != last) break;
+            };
         }
@@ -16638,7 +16628,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     if (node->src[0]->type != GGML_TYPE_F32) {
                         // here we need memory just for single 2D matrix from src0
                         cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
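The new wait loop only yields after ops whose workers tend to finish far apart (currently `GGML_OP_MUL_MAT`), which is what resolves ggml issue #291 across operating systems with both cheap and expensive `sched_yield()`. The pattern in isolation (a hypothetical standalone version using C++ atomics instead of ggml's C wrappers):

```cpp
#include <atomic>
#include <sched.h>

// Spin until another worker advances node_n past `last`; yield only when the
// previous op is known to leave stragglers behind (do_yield == true).
void wait_for_next_node(const std::atomic<int> & node_n, int last, bool do_yield) {
    while (node_n.load(std::memory_order_relaxed) == last) {
        if (do_yield) {
            sched_yield(); // give the OS a chance to schedule the straggler
        }
    }
}
```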
diff --git a/llama.cpp b/llama.cpp
index 1aba937fa..e3617c294 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2191,7 +2191,11 @@ struct llama_model_loader {
                 type_max = type;
             }
 
-            // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+            // TODO: make runtime configurable
+#if 0
+            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+#endif
         }
 
         switch (type_max) {
@@ -4801,7 +4805,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4925,7 +4928,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -5024,9 +5026,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         const int64_t n_rot = n_embd_head_k / 2;
@@ -5238,9 +5238,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5333,7 +5331,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5429,7 +5426,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5756,7 +5752,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * attn_norm_output;
@@ -5980,7 +5975,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
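Dropping `GGML_ASSERT(n_embd_gqa == n_embd)` is what lets these graph builders accept grouped-query-attention models, where the KV projection is narrower than the embedding. Illustration with hypothetical 7B-like dimensions:

```cpp
constexpr int n_embd      = 4096;
constexpr int n_head      = 32;
constexpr int n_head_kv   = 8;                       // GQA: 4 query heads per KV head
constexpr int n_embd_head = n_embd / n_head;         // 128
constexpr int n_embd_gqa  = n_embd_head * n_head_kv; // 1024 != n_embd
static_assert(n_embd_gqa != n_embd, "the removed assert would have fired here");
```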
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 354246a26..fe7f3202f 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-3fd01e00e40583ccd4b393a7c6502d6a4455a1d5
+f96711108d55bdbbd277e6be07204dce6a94fb93