diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index e5fcb37d6..360602d65 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build ARG CUDA_DOCKER_ARCH=all RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip + apt-get install -y build-essential python3 python3-pip git COPY requirements.txt requirements.txt diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile index 30c01196a..2b7faf7c1 100644 --- a/.devops/main-cuda.Dockerfile +++ b/.devops/main-cuda.Dockerfile @@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build ARG CUDA_DOCKER_ARCH=all RUN apt-get update && \ - apt-get install -y build-essential + apt-get install -y build-essential git WORKDIR /app diff --git a/CMakeLists.txt b/CMakeLists.txt index 14cc02f07..969fd5fd1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,7 +185,7 @@ if (LLAMA_ALL_WARNINGS) ) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") # g++ only - set(cxx_flags ${cxx_flags} -Wno-format-truncation) + set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds) endif() else() # todo : msvc @@ -235,7 +235,7 @@ if (NOT MSVC) endif() endif() -if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") +if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")) message(STATUS "ARM detected") if (MSVC) # TODO: arm msvc? @@ -385,4 +385,4 @@ target_link_libraries(llama PRIVATE ${LLAMA_EXTRA_LIBS} ) add_subdirectory(examples) -endif() \ No newline at end of file +endif() diff --git a/Package.swift b/Package.swift index 96f52c4f0..fb95ef7eb 100644 --- a/Package.swift +++ b/Package.swift @@ -2,8 +2,30 @@ import PackageDescription +#if arch(arm) || arch(arm64) +let platforms: [SupportedPlatform]? = [ + .macOS(.v11), + .iOS(.v14), + .watchOS(.v4), + .tvOS(.v14) +] +let exclude: [String] = [] +let additionalSources: [String] = ["ggml-metal.m"] +let additionalSettings: [CSetting] = [ + .unsafeFlags(["-fno-objc-arc"]), + .define("GGML_SWIFT"), + .define("GGML_USE_METAL") +] +#else +let platforms: [SupportedPlatform]? 
= nil
+let exclude: [String] = ["ggml-metal.metal"]
+let additionalSources: [String] = []
+let additionalSettings: [CSetting] = []
+#endif
+
 let package = Package(
     name: "llama",
+    platforms: platforms,
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],
@@ -11,23 +33,23 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
-            exclude: ["ggml-metal.metal"],
+            exclude: exclude,
            sources: [
                "ggml.c",
                "llama.cpp",
                "ggml-alloc.c",
-                "k_quants.c"
-            ],
+                "k_quants.c",
+            ] + additionalSources,
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32"]),
                .define("GGML_USE_K_QUANTS"),
                .define("GGML_USE_ACCELERATE")
-            ],
+            ] + additionalSettings,
            linkerSettings: [
                .linkedFramework("Accelerate")
            ]
-        ),
+        )
     ],
    cxxLanguageStandard: .cxx11
)
diff --git a/common/common.cpp b/common/common.cpp
index 22f65ac46..6e5d5b4d5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -57,7 +57,7 @@ int32_t get_num_physical_cores() {
             siblings.insert(line);
         }
     }
-    if (siblings.size() > 0) {
+    if (!siblings.empty()) {
         return static_cast<int32_t>(siblings.size());
     }
 #elif defined(__APPLE__) && defined(__MACH__)
@@ -773,7 +773,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         LOG("warming up the model with an empty run\n");
         const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
-        llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
         llama_reset_timings(lctx);
     }
diff --git a/common/common.h b/common/common.h
index 85ac0df9b..012bf5e13 100644
--- a/common/common.h
+++ b/common/common.h
@@ -20,6 +20,9 @@
 #define DIRECTORY_SEPARATOR '/'
 #endif // _WIN32
+#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)
+
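The `die()`/`die_fmt()` helpers added to `common.h` above rely on the classic `do { ... } while (0)` macro wrapper. A minimal standalone sketch of why that idiom matters; `open_or_die()` is a hypothetical caller, for illustration only:

```cpp
// Sketch: do { ... } while (0) makes a multi-statement macro behave as a
// single statement, so it composes safely with an unbraced if/else.
// ##__VA_ARGS__ is a GNU/MSVC extension that drops the trailing comma when
// no varargs are passed (C++20 offers __VA_OPT__ as the portable form).
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#define die(msg)          do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)

static void open_or_die(const char * path) {
    FILE * fp = std::fopen(path, "rb");
    if (!fp)
        die_fmt("%s: %s", strerror(errno), path); // expands to one statement: no dangling-else trap
    std::fclose(fp);
}
```

This is also why the diff below can replace multi-line `fprintf` + `exit` sequences and `throw std::runtime_error(...)` calls with a single macro invocation.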
 //
 // CLI argument parsing
 //
diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index e76bd11c3..177d1e3a8 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -415,6 +415,7 @@ namespace grammar_parser {
     std::vector<const llama_grammar_element *> parse_state::c_rules() {
         std::vector<const llama_grammar_element *> ret;
+        ret.reserve(rules.size());
         for (const auto & rule : rules) {
             ret.push_back(rule.data());
         }
diff --git a/convert.py b/convert.py
index 59d75141d..4ac5030db 100755
--- a/convert.py
+++ b/convert.py
@@ -145,7 +145,6 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
 class Params:
     n_vocab: int
     n_embd:  int
-    n_mult:  int
     n_layer: int
     n_ctx:   int
     n_ff:    int
@@ -161,15 +160,6 @@ class Params:
     # path to the directory containing the model files
     path_model: Path | None = None
-    @staticmethod
-    def find_n_mult(n_ff: int, n_embd: int) -> int:
-        # hardcoded magic range
-        for n_mult in range(8192, 1, -1):
-            calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
-            if calc_ff == n_ff:
-                return n_mult
-        raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
-
     @staticmethod
     def guessed(model: LazyModel) -> Params:
         # try transformer naming first
@@ -197,7 +187,6 @@ class Params:
         return Params(
             n_vocab = n_vocab,
             n_embd  = n_embd,
-            n_mult  = n_mult,
             n_layer = n_layer,
             n_ctx   = -1,
             n_ff    = n_ff,
@@ -225,8 +214,6 @@ class Params:
         else:
             f_rope_scale = None
-        n_mult = Params.find_n_mult(n_ff, n_embd)
-
         if "max_sequence_length" in config:
             n_ctx = config["max_sequence_length"]
         elif "max_position_embeddings" in config:
@@ -238,7 +225,6 @@ class Params:
         return Params(
             n_vocab = n_vocab,
             n_embd  = n_embd,
-            n_mult  = n_mult,
             n_layer = n_layer,
             n_ctx   = n_ctx,
             n_ff    = n_ff,
@@ -250,7 +236,7 @@ class Params:
         )
     # LLaMA v2 70B params.json
-    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
+    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
     @staticmethod
     def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))
@@ -258,7 +244,6 @@ class Params:
         n_vocab          = config["vocab_size"] if "vocab_size" in config else -1
         n_embd           = config["dim"]
         n_layer          = config["n_layers"]
-        n_mult           = config["multiple_of"]
         n_ff             = -1
         n_head           = config["n_heads"]
         n_head_kv        = config["n_kv_heads"] if "n_kv_heads" in config else n_head
@@ -266,7 +251,7 @@ class Params:
         f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if f_rope_freq_base and f_rope_freq_base == 1000000:
+        if f_rope_freq_base == 1000000:
             # CodeLlama
             n_ctx = 16384
         elif config["norm_eps"] == 1e-05:
@@ -285,7 +270,6 @@ class Params:
         return Params(
             n_vocab = n_vocab,
             n_embd  = n_embd,
-            n_mult  = n_mult,
             n_layer = n_layer,
             n_ctx   = n_ctx,
             n_ff    = n_ff,
@@ -841,9 +825,9 @@ class OutputFile:
         name = "LLaMA"
         # TODO: better logic to determine model name
-        if (params.n_ctx == 4096):
+        if params.n_ctx == 4096:
             name = "LLaMA v2"
-        elif params.path_model:
+        elif params.path_model is not None:
             name = str(params.path_model.parent).split('/')[-1]
         self.gguf.add_name                (name)
@@ -856,13 +840,13 @@ class OutputFile:
         self.gguf.add_head_count_kv       (params.n_head_kv)
         self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
-        if params.f_rope_freq_base:
+        if params.f_rope_freq_base is not None:
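The convert.py changes from truthiness checks to `is not None` avoid treating a present-but-zero value as missing. A small C++ sketch of the same distinction using std::optional (names are illustrative stand-ins, not the real converter):

```cpp
#include <cstdio>
#include <optional>

int main() {
    // present, but legitimately zero -- must still be written out
    std::optional<float> f_rope_scale = 0.0f;

    // wrong: mirrors `if params.f_rope_scale:` -- silently skips the field when it is 0.0
    if (f_rope_scale.value_or(0.0f) != 0.0f) {
        std::printf("writing rope scale %g\n", *f_rope_scale);
    }

    // right: mirrors `if params.f_rope_scale is not None:` -- tests presence, not truthiness
    if (f_rope_scale.has_value()) {
        std::printf("writing rope scale %g\n", *f_rope_scale);
    }
    return 0;
}
```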
self.gguf.add_rope_freq_base(params.f_rope_freq_base) - if params.f_rope_scale: + if params.f_rope_scale is not None: self.gguf.add_rope_scale_linear(params.f_rope_scale) - if params.ftype: + if params.ftype is not None: self.gguf.add_file_type(params.ftype) def add_meta_vocab(self, vocab: Vocab) -> None: diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 4d021434b..6b31aea78 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -1,7 +1,3 @@ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "common.h" #include "llama.h" #include "build-info.h" diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 9e856c21a..293b455d0 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "llama.h" +#include "common.h" #include #include @@ -499,10 +500,10 @@ struct llama_file { errno = 0; std::size_t ret = std::fread(ptr, size, 1, fp); if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); + die_fmt("fread failed: %s", strerror(errno)); } if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); + die("unexpectedly reached end of file"); } } @@ -597,8 +598,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); llama_file file(filename, "rb"); if (!file.fp) { - fprintf(stderr, "error: %s: %s\n", strerror(errno), filename); - exit(1); + die_fmt("%s: %s", strerror(errno), filename); } const int n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 036bdb398..ef12212ba 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -1,8 +1,3 @@ -// Defines sigaction on msys: -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "embd-input.h" #include @@ -23,7 +18,7 @@ extern "C" { struct MyModel* create_mymodel(int argc, char ** argv) { gpt_params params; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { return nullptr; } diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 93d583b5c..e4a0a38c8 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -11,17 +11,12 @@ int main(int argc, char ** argv) { gpt_params params; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { return 1; } params.embedding = true; - if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" - "expect poor results\n", __func__, params.n_ctx); - } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); if (params.seed == LLAMA_DEFAULT_SEED) { @@ -47,6 +42,12 @@ int main(int argc, char ** argv) { return 1; } + const int n_ctx_train = llama_n_ctx_train(ctx); + if (params.n_ctx > n_ctx_train) { + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } + // print system information { fprintf(stderr, "\n"); diff --git 
a/examples/gptneox-wip/falcon-main.cpp b/examples/gptneox-wip/falcon-main.cpp
index d4b130b25..7f9a1620b 100644
--- a/examples/gptneox-wip/falcon-main.cpp
+++ b/examples/gptneox-wip/falcon-main.cpp
@@ -953,7 +953,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }
diff --git a/examples/gptneox-wip/gptneox-main.cpp b/examples/gptneox-wip/gptneox-main.cpp
index b6cc46c5f..55eba0cdc 100644
--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp
@@ -925,7 +925,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 72a025077..dedaa34fd 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -986,7 +986,12 @@ int main(int argc, char ** argv) {
         test t(inst, lmodel, ctx);
         // warmup run
-        test_gen(ctx, 1, 0, t.n_threads);
+        if (t.n_prompt > 0) {
+            test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
+        }
+        if (t.n_gen > 0) {
+            test_gen(ctx, 1, 0, t.n_threads);
+        }
         for (int i = 0; i < params.reps; i++) {
             uint64_t t_start = get_time_ns();
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 9201b53bd..baec6ba12 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,8 +1,3 @@
-// Defines sigaction on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
 #include "common.h"
 #include "console.h"
@@ -48,8 +43,9 @@ static bool is_interacting = false;
 void write_logfile(
     const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> input_tokens, const std::string output, const std::vector<llama_token> output_tokens) {
-
+    const std::vector<llama_token> & input_tokens, const std::string & output,
+    const std::vector<llama_token> & output_tokens
+) {
     if (params.logdir.empty()) {
         return;
     }
@@ -109,7 +105,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }
@@ -186,8 +182,10 @@ int main(int argc, char ** argv) {
         return 1;
     }
-    if (params.n_ctx > llama_n_ctx(ctx)) {
-        LOG_TEE("%s: warning: base model only supports context sizes no greater than %d tokens (%d specified)\n", __func__, llama_n_ctx(ctx), params.n_ctx);
+    const int n_ctx_train = llama_n_ctx_train(ctx);
+    if (params.n_ctx > n_ctx_train) {
+        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                __func__, n_ctx_train, params.n_ctx);
     } else if (params.n_ctx < 8) {
         LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
@@ -303,7 +301,7 @@ int main(int argc, char ** argv) {
     // debug message about similarity of saved session, if applicable
     size_t n_matching_session_tokens = 0;
-    if (session_tokens.size() > 0) {
+    if (!session_tokens.empty()) {
        for (llama_token id : session_tokens) {
            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                break;
            }
@@ -401,7 +399,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: interactive mode on.\n", __func__);
-        if (params.antiprompt.size()) {
+        if (!params.antiprompt.empty()) {
             for (const auto & antiprompt : params.antiprompt) {
                 LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
             }
@@ -499,7 +497,7 @@ int main(int argc,
char ** argv) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict - if (embd.size() > 0) { + if (!embd.empty()) { // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via // --prompt or --file which uses the same value. int max_embd_size = n_ctx - 4; @@ -624,7 +622,7 @@ int main(int argc, char ** argv) { LOG("n_past = %d\n", n_past); } - if (embd.size() > 0 && !path_session.empty()) { + if (!embd.empty() && !path_session.empty()) { session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); n_session_consumed = session_tokens.size(); } @@ -695,7 +693,7 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { // check for reverse prompt - if (params.antiprompt.size()) { + if (!params.antiprompt.empty()) { std::string last_output; for (auto id : last_tokens) { last_output += llama_token_to_piece(ctx, id); @@ -732,7 +730,7 @@ int main(int argc, char ** argv) { LOG("found EOS token\n"); if (params.interactive) { - if (params.antiprompt.size() != 0) { + if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 843b2ae35..3a1c8c28d 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -655,7 +655,7 @@ int main(int argc, char ** argv) { gpt_params params; params.n_batch = 512; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { return 1; } @@ -693,9 +693,10 @@ int main(int argc, char ** argv) { return 1; } - if (params.n_ctx > llama_n_ctx(ctx)) { - fprintf(stderr, "%s: warning: model might not support context sizes greater than %d tokens (%d specified);" - "expect poor results\n", __func__, llama_n_ctx(ctx), params.n_ctx); + const int n_ctx_train = llama_n_ctx_train(ctx); + if (params.n_ctx > n_ctx_train) { + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); } // print system information diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 06ce18f09..6ce03ba7b 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -71,7 +71,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) { } // Check if a layer is included/excluded by command line -bool layer_included(const quantize_stats_params params, const std::string & layer) { +bool layer_included(const quantize_stats_params & params, const std::string & layer) { for (const auto& excluded : params.exclude_layers) { if (std::regex_search(layer, std::regex(excluded))) { return false; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 6b13b281d..adbb0bce5 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -141,10 +141,9 @@ int main(int argc, char ** argv) { if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); return 1; - } else { - if (ftype_str == "COPY") { - params.only_copy = true; - } + } + if (ftype_str == "COPY") { + params.only_copy = true; } arg_idx++; } diff --git a/examples/save-load-state/save-load-state.cpp 
b/examples/save-load-state/save-load-state.cpp
index 573bc4ef9..14e9501ca 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -13,7 +13,7 @@ int main(int argc, char ** argv) {
     params.repeat_last_n = 64;
     params.prompt = "The quick brown fox";
-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }
@@ -44,7 +44,7 @@ int main(int argc, char ** argv) {
         llama_free_model(model);
         return 1;
     }
-    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto tokens = llama_tokenize(ctx, params.prompt, true);
     auto n_prompt_tokens = tokens.size();
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 6b606447d..3f3c64650 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -139,7 +139,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
 }
 // convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> probs)
+static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> & probs)
 {
     json out = json::array();
     for (const auto &prob : probs)
@@ -271,7 +271,7 @@ struct llama_server_context
         return true;
     }
-    std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
     {
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
@@ -611,7 +611,7 @@ struct llama_server_context
     completion_token_output doCompletion()
     {
-        const completion_token_output token_with_probs = nextToken();
+        auto token_with_probs = nextToken();
        const std::string token_text = token_with_probs.tok == -1 ?
"" : llama_token_to_piece(ctx, token_with_probs.tok); generated_text += token_text; @@ -1255,7 +1255,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) { struct token_translator { llama_context * ctx; std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } - std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); } + std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); } }; void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) { diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 4ee85faca..ba5de0cc6 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,7 +1,3 @@ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "build-info.h" #include "common.h" diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index c6211ac79..822d7b529 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -1,7 +1,3 @@ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "build-info.h" #include "common.h" diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 6fe85d419..947aa7ed3 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -169,10 +169,6 @@ struct my_llama_hparams { float rope_freq_base = 10000.0f; float rope_freq_scale = 1.0f; - - bool operator!=(const my_llama_hparams& other) const { - return memcmp(this, &other, sizeof(my_llama_hparams)); - } }; struct my_llama_layer { @@ -929,28 +925,6 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa } } - -#ifdef __GNUC__ -#ifdef __MINGW32__ -__attribute__((format(gnu_printf, 1, 2))) -#else -__attribute__((format(printf, 1, 2))) -#endif -#endif -static std::string format(const char * fmt, ...) 
{ - va_list ap, ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { FILE * fp = std::fopen(filename, "rb"); if (fp == NULL) { @@ -983,10 +957,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto out.resize(size+1); if (std::fread(buf.data(), size, 1, fp) != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); + die("unexpectedly reached end of file"); } if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); + die_fmt("fread failed: %s", strerror(errno)); } buf[size] = '\0'; @@ -1047,11 +1021,11 @@ void shuffle_ints(int * begin, int * end) { if (kid >= 0) { \ enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ if (ktype != (type)) { \ - throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \ + die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ } \ (dst) = func(ctx, kid); \ } else if (req) { \ - throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ + die_fmt("key not found in model: %s", skey.c_str()); \ } \ } @@ -1136,7 +1110,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); } else { - throw std::runtime_error("unknown optimizer type\n"); + die("unknown optimizer type"); } } @@ -1315,20 +1289,20 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST)); if (token_idx == -1) { - throw std::runtime_error("cannot find tokenizer vocab in model file\n"); + die("cannot find tokenizer vocab in model file"); } const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx); const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES)); if (score_idx == -1) { - throw std::runtime_error("cannot find tokenizer scores in model file\n"); + die("cannot find tokenizer scores in model file"); } const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx); const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE)); if (toktype_idx == -1) { - throw std::runtime_error("cannot find token type list in GGUF file\n"); + die("cannot find token type list in GGUF file"); } const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx); @@ -1356,7 +1330,7 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod // read and copy bpe merges const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES)); if (merges_keyidx == -1) { - throw std::runtime_error("cannot find tokenizer merges in model file\n"); + die("cannot find tokenizer merges in model file"); } const int n_merges = gguf_get_arr_n(vctx, merges_keyidx); @@ -1988,7 +1962,7 @@ void opt_callback(void * vdata, float * sched) { float min_sched = params->adam_min_alpha / params->adam_alpha; *sched = min_sched + *sched * (1.0f - min_sched); - int impr_plot = std::isnan(opt->loss_after) ? 
0 : -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + int impr_plot = std::isnan(opt->loss_after) ? 0 : -std::lround(1 + (opt->loss_before - opt->loss_after) * 10.0f); printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); if (data->shuffle_countdown < n_batch) { diff --git a/ggml-alloc.c b/ggml-alloc.c index d5570ac97..8c8b72798 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -1,8 +1,3 @@ -// defines MAP_ANONYMOUS -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "ggml-alloc.h" #include "ggml.h" #include @@ -138,7 +133,7 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { #ifdef GGML_ALLOCATOR_DEBUG - GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources + GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated #endif size_t size = ggml_allocr_get_alloc_size(alloc, tensor); @@ -165,14 +160,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) if (best_fit_block == -1) { // the last block is our last resort struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; + max_avail = MAX(max_avail, block->size); if (block->size >= size) { best_fit_block = alloc->n_free_blocks - 1; - max_avail = MAX(max_avail, block->size); } else { fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", __func__, size, max_avail); GGML_ASSERT(!"not enough space in the buffer"); - return; + return; } } struct free_block * block = &alloc->free_blocks[best_fit_block]; @@ -316,7 +311,11 @@ static void * alloc_vmem(size_t size) { #if defined(_WIN32) return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS); #elif defined(_POSIX_MAPPED_FILES) - return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (ptr == MAP_FAILED) { + return NULL; + } + return ptr; #else // use a fixed address for other platforms uintptr_t base_addr = (uintptr_t)-size - 0x100; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 93a2342a6..3c3e42b0a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -144,8 +144,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); do { \ cudaError_t err_ = (err); \ if (err_ != cudaSuccess) { \ - fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ + int id; \ + cudaGetDevice(&id); \ + fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ cudaGetErrorString(err_)); \ + fprintf(stderr, "current device: %d\n", id); \ exit(1); \ } \ } while (0) @@ -155,8 +158,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); do { \ cublasStatus_t err_ = (err); \ if (err_ != CUBLAS_STATUS_SUCCESS) { \ + int id; \ + cudaGetDevice(&id); \ fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \ err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \ + fprintf(stderr, "current device: %d\n", id); \ exit(1); \ } \ } while (0) @@ -165,7 +171,10 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); do { \ cublasStatus_t err_ = (err); \ if (err_ != CUBLAS_STATUS_SUCCESS) { \ + int id; \ + cudaGetDevice(&id); \ fprintf(stderr, 
"\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + fprintf(stderr, "current device: %d\n", id); \ exit(1); \ } \ } while (0) @@ -212,10 +221,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); -typedef void (*ggml_cuda_op_t)( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, - float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main); +typedef void (*ggml_cuda_op_mul_mat_t)( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream); +typedef void (*ggml_cuda_op_flatten_t)( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream); // QK = number of values after dequantization // QR = QK / number of values before dequantization @@ -396,11 +408,29 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); #endif +#define MUL_MAT_SRC1_COL_STRIDE 128 + +#define MAX_STREAMS 8 +static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr }; + struct ggml_tensor_extra_gpu { void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors - cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs + cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs }; +// this is faster on Windows +// probably because the Windows CUDA libraries forget to make this check before invoking the drivers +inline cudaError_t ggml_cuda_set_device(const int device) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + + if (device == current_device) { + return cudaSuccess; + } + + return cudaSetDevice(device); +} + static int g_device_count = -1; static int g_main_device = 0; static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; @@ -413,8 +443,6 @@ static size_t g_scratch_offset = 0; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; -static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; - static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -4086,7 +4114,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) { +static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) { const int col = blockDim.x*blockIdx.x + threadIdx.x; 
     const int half_n_dims = ncols/4;
@@ -4098,8 +4127,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;
     const float col_theta_scale = powf(theta_scale, col);
+    const float p = p0 + p_delta*(row/p_delta_rows);
-    const float theta = p*col_theta_scale;
+    const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
@@ -4109,7 +4139,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + 0]           = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-    const float block_theta = block_p*col_theta_scale;
+    const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);
@@ -4984,12 +5014,13 @@
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 4 == 0);
-    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
-    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+        const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
 }
 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5123,25 +5154,27 @@ void ggml_init_cublas() {
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
         fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        for (int id = 0; id < g_device_count; ++id) {
+        for (int64_t id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+            fprintf(stderr, "  Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
             g_tensor_split[id] = total_vram;
             total_vram += prop.totalGlobalMem;
             g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
         }
-        for (int id = 0; id < g_device_count; ++id) {
+        for (int64_t id = 0; id < g_device_count; ++id) {
             g_tensor_split[id] /= total_vram;
         }
-        for (int id = 0; id < g_device_count; ++id) {
-            CUDA_CHECK(cudaSetDevice(id));
+        for (int64_t id = 0; id < g_device_count; ++id) {
+            CUDA_CHECK(ggml_cuda_set_device(id));
-            // create main stream
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
+            // create cuda streams
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
+            }
             // create cublas handle
             CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -5249,225 +5282,169 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
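The init loop above replaces the single per-device main stream with `MAX_STREAMS` non-blocking streams per device. A trimmed sketch of that setup under renamed constants (error handling reduced to early returns; not the real ggml initialization path):

```cpp
#include <cuda_runtime.h>

constexpr int kMaxDevices = 16; // stand-in for GGML_CUDA_MAX_DEVICES
constexpr int kMaxStreams = 8;  // mirrors MAX_STREAMS in the diff

static cudaStream_t g_streams[kMaxDevices][kMaxStreams] = { nullptr };

// Create the per-device stream pool. cudaStreamNonBlocking keeps these
// streams from synchronizing implicitly with the legacy default stream,
// which is what allows work for several GPUs to overlap.
cudaError_t init_stream_pool(int device_count) {
    for (int id = 0; id < device_count && id < kMaxDevices; ++id) {
        cudaError_t err = cudaSetDevice(id);
        if (err != cudaSuccess) {
            return err;
        }
        for (int is = 0; is < kMaxStreams; ++is) {
            err = cudaStreamCreateWithFlags(&g_streams[id][is], cudaStreamNonBlocking);
            if (err != cudaSuccess) {
                return err;
            }
        }
    }
    return cudaSuccess;
}
```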
} inline void ggml_cuda_op_add( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr); - GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); - - const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - // compute if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); + add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main); + add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream); } else { GGML_ASSERT(false); } (void) src1; (void) dst; - (void) src0_ddq_i; - (void) i02; - (void) i1; } inline void ggml_cuda_op_mul( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); - - const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); + mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream); (void) dst; - (void) src0_ddq_i; - (void) i02; - (void) i1; } inline void ggml_cuda_op_gelu( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); - const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; - - // compute - gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) 
src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_silu( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); - const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; - - // compute - silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_norm( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t nrows = ggml_nrows(src0); - // compute - norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); + norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_rms_norm( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t nrows = ggml_nrows(src0); float eps; memcpy(&eps, dst->op_params, sizeof(float)); - // compute - rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main); + rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_mul_mat_q( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ - - GGML_ASSERT(src0_ddq_i != nullptr); - GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * 
dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { const int64_t ne00 = src0->ne[0]; const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; GGML_ASSERT(ne10 % QK8_1 == 0); const int64_t ne0 = dst->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t row_diff = row_high - row_low; int id; CUDA_CHECK(cudaGetDevice(&id)); // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff; - - const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ? - ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; - size_t as; - void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as); - quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main); + const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: - ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q4_1: - ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q5_0: - ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q5_1: - ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q8_0: - ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q2_K: - ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q3_K: - ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q4_K: - ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, 
src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q5_K: - ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q6_K: - ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; default: GGML_ASSERT(false); break; } - ggml_cuda_pool_free(src1_q8_1, as); - (void) src1; (void) dst; - (void) src0_ddf_i; - (void) i02; - (void) i1; + (void) src1_ddf_i; } static int64_t get_row_rounding(ggml_type type) { @@ -5501,168 +5478,144 @@ static int64_t get_row_rounding(ggml_type type) { } } -inline void ggml_cuda_op_mul_mat_vec( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ - - GGML_ASSERT(src0_ddq_i != nullptr); - GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); +inline void ggml_cuda_op_mul_mat_vec_q( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { const int64_t ne00 = src0->ne[0]; - const int64_t nrows = i01_high - i01_low; + const int64_t row_diff = row_high - row_low; -#ifdef GGML_CUDA_FORCE_DMMV - const bool use_mul_mat_vec_q = false; - (void) g_compute_capabilities[0]; -#else - int id; - CUDA_CHECK(cudaGetDevice(&id)); - - bool mul_mat_vec_q_implemented = - src0->type == GGML_TYPE_Q4_0 || - src0->type == GGML_TYPE_Q4_1 || - src0->type == GGML_TYPE_Q5_0 || - src0->type == GGML_TYPE_Q5_1 || - src0->type == GGML_TYPE_Q8_0; -#if QK_K == 256 - mul_mat_vec_q_implemented = mul_mat_vec_q_implemented || - src0->type == GGML_TYPE_Q2_K || - src0->type == GGML_TYPE_Q3_K || - src0->type == GGML_TYPE_Q4_K || - src0->type == GGML_TYPE_Q5_K || - src0->type == GGML_TYPE_Q6_K; -#endif // QK_K == 256 - - const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented; -#endif - - if (use_mul_mat_vec_q) { - const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ? 
- ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; - size_t as; - void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); - quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main); - - switch (src0->type) { - case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - default: - GGML_ASSERT(false); - break; - } - - ggml_cuda_pool_free(src1_q8_1, as); - } else { - // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics -#ifdef GGML_CUDA_F16 - size_t ash; - dfloat * src1_dfloat = nullptr; // dfloat == half - - bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || - src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || - src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; - - if (src1_convert_f16) { - src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); - ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00, - ne00, 1, sizeof(float), 0, 0, - ne00, 1, sizeof(half), 0, 0, cudaStream_main); - } -#else - dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion -#endif // GGML_CUDA_F16 - - switch (src0->type) { - case GGML_TYPE_Q4_0: - dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q4_1: - dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_0: - dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_1: - dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q8_0: - dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q2_K: - dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q3_K: - dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q4_K: - dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_K: - 
dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q6_K: - dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_F16: - convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - default: - GGML_ASSERT(false); - break; - } - -#ifdef GGML_CUDA_F16 - if (src1_convert_f16) { - ggml_cuda_pool_free(src1_dfloat, ash); - } -#endif // GGML_CUDA_F16 + switch (src0->type) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; } (void) src1; (void) dst; - (void) src0_ddf_i; - (void) i02; - (void) i1; + (void) src1_ddf_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} + +inline void ggml_cuda_op_dequantize_mul_mat_vec( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_CUDA_F16 + size_t ash; + dfloat * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); + ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_CUDA_F16 + + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, 
row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } + +#ifdef GGML_CUDA_F16 + if (src1_convert_f16) { + ggml_cuda_pool_free(src1_dfloat, ash); + } +#endif // GGML_CUDA_F16 + + (void) src1; + (void) dst; + (void) src1_ddq_i; + (void) src1_ncols; + (void) src1_padded_row_size; } inline void ggml_cuda_op_mul_mat_cublas( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { - GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(src0_dd_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(dst_dd_i != nullptr); const float alpha = 1.0f; const float beta = 0.0f; @@ -5670,43 +5623,48 @@ inline void ggml_cuda_op_mul_mat_cublas( const int64_t ne00 = src0->ne[0]; const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; const int64_t ne0 = dst->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t row_diff = row_high - row_low; + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + size_t src0_as; + float * src0_ddf_i = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); + to_fp32_cuda(src0_dd_i, src0_ddf_i, row_diff*ne00, stream); int id; CUDA_CHECK(cudaGetDevice(&id)); // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff; + int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? 
ne0 : row_diff; - CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main)); + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); CUBLAS_CHECK( cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, - i01_diff, ne11, ne10, + row_diff, src1_ncols, ne10, &alpha, src0_ddf_i, ne00, - src1_ddf_i, ne10, - &beta, dst_ddf_i, ldc)); + src1_ddf_i, ne10, + &beta, dst_dd_i, ldc)); + + ggml_cuda_pool_free(src0_ddf_i, src0_as); (void) dst; - (void) src0_ddq_i; - (void) i02; - (void) i1; + (void) src0_dd_i; + (void) src1_ddq_i; + (void) src1_padded_row_size; } inline void ggml_cuda_op_rope( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t nrows = ggml_nrows(src0); const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; @@ -5719,44 +5677,37 @@ inline void ggml_cuda_op_rope( memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; const bool is_neox = mode & 2; const bool is_glm = mode & 4; // compute if (is_glm) { - const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale; - const float id_p = min(p, n_ctx - 2.f); - const float block_p = max(p - (n_ctx - 2.f), 0.f); - rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream); } else if (is_neox) { GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); - const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; - rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); + rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); } else { - const float p0 = (((mode & 1) == 0 ? 
n_past : 0)) * freq_scale; - rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); + rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); } (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_alibi( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t nrows = ggml_nrows(src0); const int n_past = ((int32_t *) dst->op_params)[0]; const int n_head = ((int32_t *) dst->op_params)[1]; @@ -5771,334 +5722,355 @@ inline void ggml_cuda_op_alibi( const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - // compute - alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main); + alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream); (void) src1; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_diag_mask_inf( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - const int64_t i01_diff = i01_high - i01_low; + const int nrows0 = ggml_nrows(src0); const int n_past = ((int32_t *) dst->op_params)[0]; - // compute - diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main); + diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_soft_max( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - 
i01_low; + const int64_t nrows = ggml_nrows(src0); - // compute - soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); + soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_scale( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const float scale = ((float *) src1->data)[0]; - const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; - - // compute - scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main); + scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } -static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) { +static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) { + const int64_t nrows0 = ggml_nrows(src0); + + const bool use_src1 = src1 != nullptr; + const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + + GGML_ASSERT( src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? 
(ggml_tensor_extra_gpu *) src1->extra : nullptr; + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + + const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE; + + // dd = data device + float * src0_ddf = nullptr; + float * src1_ddf = nullptr; + float * dst_ddf = nullptr; + + // as = actual size + size_t src0_asf = 0; + size_t src1_asf = 0; + size_t dst_asf = 0; + + ggml_cuda_set_device(g_main_device); + const cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + if (src0_on_device) { + src0_ddf = (float *) src0_extra->data_device[g_main_device]; + } else { + src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); + } + + if (use_src1 && !src1_stays_on_host) { + if (src1_on_device) { + src1_ddf = (float *) src1_extra->data_device[g_main_device]; + } else { + src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); + } + } + if (dst_on_device) { + dst_ddf = (float *) dst_extra->data_device[g_main_device]; + } else { + dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf); + } + + // do the computation + op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); + CUDA_CHECK(cudaGetLastError()); + + // copy dst to host if necessary + if (!dst_on_device) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); + } + + if (src0_asf > 0) { + ggml_cuda_pool_free(src0_ddf, src0_asf); + } + if (src1_asf > 0) { + ggml_cuda_pool_free(src1_ddf, src1_asf); + } + if (dst_asf > 0) { + ggml_cuda_pool_free(dst_ddf, dst_asf); + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +static void ggml_cuda_op_mul_mat( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, + const bool convert_src1_to_q8_1) { + const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; const int64_t nrows0 = ggml_nrows(src0); - const bool use_src1 = src1 != nullptr; - const int64_t ne10 = use_src1 ? src1->ne[0] : 1; - const int64_t ne11 = use_src1 ? src1->ne[1] : 1; - const int64_t ne12 = use_src1 ? src1->ne[2] : 1; - const int64_t ne13 = use_src1 ? src1->ne[3] : 1; - const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + const int64_t nrows1 = ggml_nrows(src1); GGML_ASSERT(ne03 == ne13); const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); - // strides for iteration over dims 3 and 2 - const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13; - const int64_t num_iters = flatten_rows ? 1 : num_iters_0; - const int64_t stride_mod = flatten_rows ? 
num_iters_0 : 1; - const int64_t src0_stride = ne00 * ne01 * stride_mod; - const int64_t src1_stride = ne10 * ne11 * stride_mod; - const int64_t dst_stride = ne0 * ne1 * stride_mod; + GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); - const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01; - const int64_t i03_max = flatten_rows ? 1 : ne03; - const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12); - const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02; - GGML_ASSERT(!(flatten_rows && ne02 < ne12)); + const int64_t i02_divisor = ne12 / ne02; const size_t src0_ts = ggml_type_size(src0->type); const size_t src0_bs = ggml_blck_size(src0->type); + const size_t q8_1_ts = sizeof(block_q8_1); + const size_t q8_1_bs = QK8_1; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); - const bool src0_is_f32 = src0->type == GGML_TYPE_F32; - const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1); - const bool src1_stays_on_host = use_src1 && ( - dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE); + const bool src1_is_contiguous = ggml_is_contiguous(src1); + const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ? 
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + GGML_ASSERT(!(split && ne02 > 1)); + GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); - // dd = data device - char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized - float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float - float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; - float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; + char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr}; + float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float + char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1 + float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr}; - // asq = actual size quantized, asf = actual size float - size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0}; - size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0}; + // as = actual size + size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0}; size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; - size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0}; + size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0}; - // if multiple devices are used they need to wait for the main device - // here an event is recorded that signifies that the main device has finished calculating the input data - if (split && g_device_count > 1) { - CUDA_CHECK(cudaSetDevice(g_main_device)); - CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device])); - } + int64_t row_low[GGML_CUDA_MAX_DEVICES]; + int64_t row_high[GGML_CUDA_MAX_DEVICES]; - for (int id = 0; id < g_device_count; ++id) { - if (!split && id != g_main_device) { - continue; - } + for (int64_t id = 0; id < g_device_count; ++id) { + // by default, use all rows + row_low[id] = 0; + row_high[id] = ne01; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; - - int64_t row_low, row_high; + // for multi GPU, get the row boundaries from tensor split + // and round to mul_mat_q tile sizes if (split) { const int64_t rounding = get_row_rounding(src0->type); - row_low = id == 0 ? 
0 : nrows0*g_tensor_split[id]; - row_low -= row_low % rounding; - - if (id == g_device_count - 1) { - row_high = nrows0; - } else { - row_high = nrows0*g_tensor_split[id + 1]; - row_high -= row_high % rounding; + if (id != 0) { + row_low[id] = ne01*g_tensor_split[id]; + row_low[id] -= row_low[id] % rounding; + } + + if (id != g_device_count - 1) { + row_high[id] = ne01*g_tensor_split[id + 1]; + row_high[id] -= row_high[id] % rounding; } - } else { - row_low = 0; - row_high = nrows0*i02_divisor; } - if (row_low == row_high) { + } + + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { continue; } - int64_t row_diff = row_high - row_low; + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; - cudaSetDevice(id); - cudaStream_t cudaStream_main = g_cudaStreams_main[id]; - - // wait for main GPU data if necessary - if (split && id != g_main_device) { - CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device])); - } + ggml_cuda_set_device(id); + const cudaStream_t stream = g_cudaStreams[id][0]; if (src0_on_device && src0_is_contiguous) { - if (src0_is_f32) { - src0_ddf[id] = (float *) src0_extra->data_device[id]; - } else { - src0_ddq[id] = (char *) src0_extra->data_device[id]; - } + src0_dd[id] = (char *) src0_extra->data_device[id]; } else { - if (src0_is_f32) { - src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]); - } else { - src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]); + const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); + src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]); + } + + if (src1_on_device && src1_is_contiguous) { + src1_ddf[id] = (float *) src1_extra->data_device[id]; + } else { + src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]); + } + + if (convert_src1_to_q8_1) { + src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); + + if (split && src1_on_device && src1_is_contiguous) { + quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); + CUDA_CHECK(cudaGetLastError()); } } - if (src0_needs_f32 && !src0_is_f32) { - src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]); - } - - if (use_src1 && !src1_stays_on_host) { - if (src1_on_device && src1_is_contiguous) { - src1_ddf[id] = (float *) src1_extra->data_device[id]; - } else { - src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]); - } - } if (dst_on_device) { - dst_ddf[id] = (float *) dst_extra->data_device[id]; + dst_dd[id] = (float *) dst_extra->data_device[id]; } else { - size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float); - dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]); + const size_t size_dst_ddf = split ? 
(row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst); + dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]); } + } - for (int64_t i03 = 0; i03 < i03_max; i03++) { - const int64_t i13 = i03 % ne13; - for (int64_t i02 = 0; i02 < i02_max; i02++) { - const int64_t i12 = i02 % ne12; + // if multiple devices are used they need to wait for the main device + // here an event is recorded that signals that the main device has finished calculating the input data + if (split && g_device_count > 1) { + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0])); + } - const int64_t i0 = i03*i02_max + i02; + const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; + for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { + const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; + const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; - // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs - const int64_t i0_offset_low = row_low/rows_per_iter; - const int64_t i0_offset_high = row_high/rows_per_iter; + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } - int64_t i01_low = 0; - int64_t i01_high = rows_per_iter; - if (split) { - if (i0 < i0_offset_low || i0 > i0_offset_high) { - continue; - } - if (i0 == i0_offset_low) { - i01_low = row_low % rows_per_iter; - } - if (i0 == i0_offset_high) { - i01_high = row_high % rows_per_iter; - } - } + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const int64_t row_diff = row_high[id] - row_low[id]; - // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables. - // Removing the first assert or changing the order of the arguments causes the second assert to fail. - // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output. - // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU). 
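The `src1_ddq_i_offset` computed above addresses quantized `src1` columns in whole q8_1 blocks over rows padded up to `MATRIX_ROW_PADDING`. A minimal host-side sketch of that bookkeeping — the 36-byte block size (two half scales plus 32 int8 values) and the tensor extents are assumptions made for illustration:

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative constants; QK8_1 and the block size mirror ggml's q8_1 layout
// (assumed here), ROW_PADDING stands in for MATRIX_ROW_PADDING.
constexpr int64_t QK8_1       = 32;  // quantized values per q8_1 block
constexpr int64_t Q8_1_TS     = 36;  // assumed bytes per block: 2 half scales + 32 int8
constexpr int64_t ROW_PADDING = 512;

// same rounding as src1_padded_col_size above
int64_t padded_row_size(int64_t ne10) {
    return ne10 % ROW_PADDING == 0 ? ne10 : ne10 - ne10 % ROW_PADDING + ROW_PADDING;
}

int main() {
    const int64_t ne10 = 4100, ne11 = 8;        // invented src1 extents
    const int64_t pad  = padded_row_size(ne10); // -> 4608

    // byte offset of the quantized slice for batch index i0, first column src1_col_0,
    // matching (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs above
    const int64_t i0 = 2, src1_col_0 = 4;
    const int64_t off = (i0*ne11 + src1_col_0) * pad*Q8_1_TS/QK8_1;

    printf("padded row: %lld values, slice offset: %lld bytes\n",
           (long long) pad, (long long) off);
    return 0;
}
```

Padding every row to a multiple of the block size lets the quantized kernels consume whole blocks at the end of a row without a tail case.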
- GGML_ASSERT(i01_low == 0 || g_device_count > 1); - GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1); + ggml_cuda_set_device(id); + const cudaStream_t stream = g_cudaStreams[id][is]; - const int64_t i01_diff = i01_high - i01_low; - if (i01_diff == 0) { - continue; - } - const int64_t i11 = i13*ne12 + i12; + // wait for main GPU data if necessary + if (split && (id != g_main_device || is != 0)) { + CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0])); + } + + for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { + const int64_t i03 = i0 / ne12; + const int64_t i02 = i0 % ne12; + + const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; // for split tensors the data begins at i0 == i0_offset_low - char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs; - float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride; - float * src1_ddf_i = src1_ddf[id] + i11*src1_stride; - float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; - - // for split tensors the data pointer needs to be rounded down - // to the bin edge for i03, i02 bins beyond the first - if (i0 - i0_offset_low > 0) { - GGML_ASSERT(!flatten_rows); - src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs; - src0_ddf_i -= (row_low % ne01)*ne00; - dst_ddf_i -= (row_low % ne0)*ne1; - } + char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs; + float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10; + char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset; + float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff); // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) { - dst_ddf_i += i01_low; // offset is 0 if no tensor split + dst_dd_i += row_low[id]; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (use_src1 && !src1_stays_on_host) { - if (src1->backend == GGML_BACKEND_CPU) { - GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1)); - int64_t nrows1 = flatten_rows ? 
nrows0 : ne11; - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main)); - } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { - if (id != g_main_device) { - GGML_ASSERT(!flatten_rows); + if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (id != g_main_device) { + if (convert_src1_to_q8_1) { + char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset; + CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, + cudaMemcpyDeviceToDevice, stream)); + } else { float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; - src1_ddf_i_source += i11*src1_stride; - CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float), - cudaMemcpyDeviceToDevice, cudaStream_main)); + src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; + CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float), + cudaMemcpyDeviceToDevice, stream)); } - } else if (src1_on_device && !src1_is_contiguous) { - GGML_ASSERT(!split); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main)); - } else { - GGML_ASSERT(false); } + } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d( + src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); + } else { + GGML_ASSERT(false); } - if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { - if (src0_is_f32) { - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main)); - } else { - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main)); - } - } - - // convert src0 to f32 if it is necessary for the ggml_cuda_op - if (src0_needs_f32 && !src0_is_f32) { - to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main); + if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) { + quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } + if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream)); + } + // do the computation - op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main); + op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, + row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); // copy dst to host or other device if necessary @@ -6120,95 +6092,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. // Instead they need to be copied to the correct slice in ne0 = dst row index. // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. 
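The copy that follows uses `cudaMemcpy2DAsync` so each GPU's contiguous `row_diff`-wide partial result lands in its own row slice of the full `ne0`-wide destination. A reduced sketch of that call shape, with invented extents standing in for the real tensor sizes:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    // example extents: full rows are ne0 floats wide, this device owns row_diff of them
    const int ne0 = 1024, row_diff = 256, ncols = 4, row_low = 512;

    float *src_d = nullptr, *dst_d = nullptr;
    cudaMalloc(&src_d, (size_t) row_diff*ncols*sizeof(float));
    cudaMalloc(&dst_d, (size_t) ne0*ncols*sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // dst pitch is the full row (ne0 floats), src pitch is the slice row (row_diff
    // floats); each of the ncols "rows" of the 2D copy moves row_diff*sizeof(float)
    // bytes -- the same shape as the dhf_dst_i copy below
    cudaMemcpy2DAsync(dst_d + row_low,        ne0*sizeof(float),
                      src_d,                  row_diff*sizeof(float),
                      row_diff*sizeof(float), ncols,
                      cudaMemcpyDeviceToDevice, stream);

    cudaStreamSynchronize(stream);
    printf("copied %d x %d slice at row offset %d\n", row_diff, ncols, row_low);

    cudaFree(src_d); cudaFree(dst_d); cudaStreamDestroy(stream);
    return 0;
}
```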
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3); - CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float), - i01_diff*sizeof(float), ne1, kind, cudaStream_main)); + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0 + row_low[id]; + CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float), + row_diff*sizeof(float), src1_ncols, kind, stream)); } else { float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); - CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main)); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0; + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream)); } } - // signify to main device that other device is done - if (split && g_device_count > 1 && id != g_main_device) { - CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main)); + // add event for the main device to wait on until other device is done + if (split && (id != g_main_device || is != 0)) { + CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream)); } } } } - // wait until each device is finished, then free their buffers - for (int id = 0; id < g_device_count; ++id) { - if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) { - continue; - } + for (int64_t id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); - CUDA_CHECK(cudaSetDevice(id)); - - if (src0_asq[id] > 0) { - ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]); - } - if (src0_asf[id] > 0) { - ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]); + // free buffers again when done + if (src0_as[id] > 0) { + ggml_cuda_pool_free(src0_dd[id], src0_as[id]); } if (src1_asf[id] > 0) { ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]); } - if (dst_asf[id] > 0) { - ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]); + if (src1_asq[id] > 0) { + ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]); + } + if (dst_as[id] > 0) { + ggml_cuda_pool_free(dst_dd[id], dst_as[id]); } } // main device waits for all other devices to be finished if (split && g_device_count > 1) { - CUDA_CHECK(cudaSetDevice(g_main_device)); - for (int id = 0; id < g_device_count; ++id) { - if (id != g_main_device && src0_extra->events[id]) { - CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id])); + int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; + is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + for (int64_t id = 0; id < g_device_count; ++id) { + for (int64_t is = 0; is < is_max; ++is) { + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is])); } } } if (dst->backend == GGML_BACKEND_CPU) { - CUDA_CHECK(cudaSetDevice(g_main_device)); + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(cudaDeviceSynchronize()); } } void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op. - // Due to flatten_rows == true this does in practice not make a difference however. - // Better solution would be nice but right now that would require disproportionate changes. 
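The event handling above — record an event behind each secondary stream's work, then make the main device's stream wait on every event — is the standard CUDA pattern for joining split work without a blocking device-wide sync. A self-contained sketch of that pattern; the dummy kernel and buffer sizes are invented stand-ins for the real per-slice op:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// stand-in for the real per-device slice computation
__global__ void dummy_work(float * x) { x[threadIdx.x] += 1.0f; }

int main() {
    int n_devices = 0;
    cudaGetDeviceCount(&n_devices);
    if (n_devices > 16) n_devices = 16;
    const int main_device = 0;

    cudaStream_t streams[16];
    cudaEvent_t  events[16];
    float      * bufs[16];

    // launch one slice of work per device and record an event behind it
    for (int id = 0; id < n_devices; ++id) {
        cudaSetDevice(id);
        cudaStreamCreate(&streams[id]);
        cudaEventCreateWithFlags(&events[id], cudaEventDisableTiming);
        cudaMalloc(&bufs[id], 32*sizeof(float));
        dummy_work<<<1, 32, 0, streams[id]>>>(bufs[id]);
        cudaEventRecord(events[id], streams[id]);
    }

    // the main stream waits on every event before anything that consumes the results
    cudaSetDevice(main_device);
    for (int id = 0; id < n_devices; ++id) {
        cudaStreamWaitEvent(streams[main_device], events[id], 0);
    }
    cudaStreamSynchronize(streams[main_device]);
    printf("joined %d device(s)\n", n_devices);

    for (int id = 0; id < n_devices; ++id) {
        cudaSetDevice(id);
        cudaFree(bufs[id]);
        cudaEventDestroy(events[id]);
        cudaStreamDestroy(streams[id]);
    }
    return 0;
}
```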
- GGML_ASSERT( - (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && - src1->type == GGML_TYPE_F32 && - (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16)); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); } void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul); } void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu); } void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu); } void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm); } void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { @@ -6242,8 +6205,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr const int64_t ne12 = src1->ne[2]; - CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; @@ -6254,7 +6217,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; - ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main); + ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); } void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ @@ -6273,8 +6236,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; - CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = 
src0_extra->data_device[g_main_device]; @@ -6285,38 +6248,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; - const int row_stride_x = nb01 / sizeof(half); - const int channel_stride_x = nb02 / sizeof(half); + const int64_t row_stride_x = nb01 / sizeof(half); + const int64_t channel_stride_x = nb02 / sizeof(half); - ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main); + ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; + int64_t min_compute_capability = INT_MAX; + for (int64_t id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_compute_capabilities[id] + && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_compute_capabilities[id]; + } + } + if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_p021(src0, src1, dst); } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_nc(src0, src1, dst); }else if (src0->type == GGML_TYPE_F32) { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false); - } else { - int min_compute_capability = INT_MAX; - for (int id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_compute_capabilities[id] - && g_tensor_split[id] < (id + 1 < g_device_count ? 
g_tensor_split[id + 1] : 1.0f)) { - min_compute_capability = g_compute_capabilities[id]; - } - } - if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false); +#ifdef GGML_CUDA_FORCE_DMMV + const bool use_mul_mat_vec_q = false; +#else + const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); +#endif // GGML_CUDA_FORCE_DMMV + + if (use_mul_mat_vec_q) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); } else { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + } + } else { + if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); + } else { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } } } else { @@ -6325,8 +6299,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ } void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); } void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -6355,8 +6328,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens const int64_t nb11 = src1->nb[1]; const int64_t nb12 = src1->nb[2]; - CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; @@ -6366,10 +6339,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, cudaStream_main); + ne10, ne11, nb10, nb11, nb12, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, cudaStream_main); + ne10, ne11, nb10, nb11, nb12, main_stream); } else { GGML_ASSERT(false); } @@ -6383,28 +6356,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens } void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf); } void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max); } void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - 
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented - - const int mode = ((int32_t *) dst->op_params)[2]; - const bool is_glm = mode & 4; - - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope); } void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); } void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -6414,7 +6379,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens } void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { - int nrows = ggml_nrows(tensor); + const int64_t nrows = ggml_nrows(tensor); const int64_t ne0 = tensor->ne[0]; @@ -6424,14 +6389,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); - for (int id = 0; id < g_device_count; ++id) { + for (int64_t id = 0; id < g_device_count; ++id) { if (backend == GGML_BACKEND_GPU && id != g_main_device) { continue; } - cudaSetDevice(id); + ggml_cuda_set_device(id); - int row_low, row_high; + int64_t row_low, row_high; if (backend == GGML_BACKEND_GPU) { row_low = 0; row_high = nrows; @@ -6481,7 +6446,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { extra->data_device[id] = buf; if (backend == GGML_BACKEND_GPU_SPLIT) { - CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming)); + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + } } } @@ -6495,15 +6462,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - for (int id = 0; id < g_device_count; ++id) { + for (int64_t id = 0; id < g_device_count; ++id) { if (extra->data_device[id] != nullptr) { - CUDA_CHECK(cudaSetDevice(id)); + CUDA_CHECK(ggml_cuda_set_device(id)); CUDA_CHECK(cudaFree(extra->data_device[id])); } - if (extra->events[id] != nullptr) { - CUDA_CHECK(cudaSetDevice(id)); - CUDA_CHECK(cudaEventDestroy(extra->events[id])); + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + if (extra->events[id][is] != nullptr) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } } } @@ -6555,7 +6524,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo force_inplace; const size_t size = ggml_nbytes(tensor); - CUDA_CHECK(cudaSetDevice(g_main_device)); + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; diff --git a/ggml-metal.m b/ggml-metal.m index d0d23442e..4f3f14e24 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -63,7 +63,9 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(relu); 
GGML_METAL_DECL_KERNEL(gelu); GGML_METAL_DECL_KERNEL(soft_max); + GGML_METAL_DECL_KERNEL(soft_max_4); GGML_METAL_DECL_KERNEL(diag_mask_inf); + GGML_METAL_DECL_KERNEL(diag_mask_inf_8); GGML_METAL_DECL_KERNEL(get_rows_f16); GGML_METAL_DECL_KERNEL(get_rows_q4_0); GGML_METAL_DECL_KERNEL(get_rows_q4_1); @@ -77,6 +79,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row); + GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32); @@ -117,14 +120,17 @@ static NSString * const msl_library_source = @"see metal.metal"; struct ggml_metal_context * ggml_metal_init(int n_cb) { metal_printf("%s: allocating\n", __func__); - // Show all the Metal device instances in the system - NSArray * devices = MTLCopyAllDevices(); id device; NSString * s; + +#if TARGET_OS_OSX + // Show all the Metal device instances in the system + NSArray * devices = MTLCopyAllDevices(); for (device in devices) { s = [device name]; metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); } +#endif // Pick and show default Metal device device = MTLCreateSystemDefaultDevice(); @@ -141,12 +147,20 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT); -#if 0 - // compile from source string and show compile log +#ifdef GGML_SWIFT + // load the default.metallib file { NSError * error = nil; - ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; + NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; + NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"]; + NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath]; + NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"]; + NSURL * libURL = [NSURL fileURLWithPath:libPath]; + + // Load the metallib file into a Metal library + ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; + if (error) { metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; @@ -207,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(relu); GGML_METAL_ADD_KERNEL(gelu); GGML_METAL_ADD_KERNEL(soft_max); + GGML_METAL_ADD_KERNEL(soft_max_4); GGML_METAL_ADD_KERNEL(diag_mask_inf); + GGML_METAL_ADD_KERNEL(diag_mask_inf_8); GGML_METAL_ADD_KERNEL(get_rows_f16); GGML_METAL_ADD_KERNEL(get_rows_q4_0); GGML_METAL_ADD_KERNEL(get_rows_q4_1); @@ -221,6 +237,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row); + GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32); @@ -247,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #undef GGML_METAL_ADD_KERNEL } - metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); +#if TARGET_OS_OSX + metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.maxTransferRate != 0) { metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); } +#endif return ctx; } @@ -273,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(relu); GGML_METAL_DEL_KERNEL(gelu); GGML_METAL_DEL_KERNEL(soft_max); - GGML_METAL_DEL_KERNEL(diag_mask_inf); + GGML_METAL_DEL_KERNEL(soft_max_4); + GGML_METAL_DEL_KERNEL(diag_mask_inf_8); GGML_METAL_DEL_KERNEL(get_rows_f16); GGML_METAL_DEL_KERNEL(get_rows_q4_0); GGML_METAL_DEL_KERNEL(get_rows_q4_1); @@ -287,6 +307,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(norm); GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row); + GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4); GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); @@ -327,7 +348,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { void * ggml_metal_host_malloc(size_t n) { void * data = NULL; - const int result = posix_memalign((void **) &data, getpagesize(), n); + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { metal_printf("%s: error: posix_memalign failed\n", __func__); return NULL; @@ -401,7 +422,7 @@ bool ggml_metal_add_buffer( } } - const size_t size_page = getpagesize(); + const size_t size_page = sysconf(_SC_PAGESIZE); size_t size_aligned = size; if ((size_aligned % size_page) != 0) { @@ -454,6 +475,7 @@ bool ggml_metal_add_buffer( } } +#if TARGET_OS_OSX metal_printf(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); @@ -463,6 +485,9 @@ bool ggml_metal_add_buffer( } else { metal_printf("\n"); } +#else + metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); +#endif } return true; @@ -750,7 +775,7 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -762,7 +787,7 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -782,7 +807,7 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -796,13 +821,16 @@ void ggml_metal_graph_compute( { const int nth = 32; - [encoder setComputePipelineState:ctx->pipeline_soft_max]; + if (ne00%4 == 0) { + [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; + } else { + [encoder setComputePipelineState:ctx->pipeline_soft_max]; + } [encoder setBuffer:id_src0 
offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -810,14 +838,23 @@ void ggml_metal_graph_compute( { const int n_past = ((int32_t *)(dst->op_params))[0]; - [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; + if (ne00%8 == 0) { + [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8]; + } else { + [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; + } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; - [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + if (ne00%8 == 0) { + [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } + else { + [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } } break; case GGML_OP_MUL_MAT: { @@ -864,6 +901,7 @@ void ggml_metal_graph_compute( } else { int nth0 = 32; int nth1 = 1; + int nrows = 1; // use custom matrix x vector kernel switch (src0t) { @@ -873,8 +911,12 @@ void ggml_metal_graph_compute( nth1 = 1; if (ne11 * ne12 < 4) { [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row]; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4]; + nrows = ne11; } else { [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; + nrows = 4; } } break; case GGML_TYPE_Q4_0: @@ -995,7 +1037,7 @@ void ggml_metal_graph_compute( else if (src0t == GGML_TYPE_Q6_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { - int64_t ny = (ne11 + 3)/4; + int64_t ny = (ne11 + nrows - 1)/nrows; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } } @@ -1141,7 +1183,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; case GGML_OP_DUP: case GGML_OP_CPY: diff --git a/ggml-metal.metal b/ggml-metal.metal index 119fcbeb6..f45b1490f 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -63,18 +63,18 @@ kernel void kernel_mul_row( } kernel void kernel_scale( - device const float * src0, - device float * dst, + device const float4 * src0, + device float4 * dst, constant float & scale, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * scale; } kernel void kernel_silu( - device const float * src0, - device float * dst, + device const float4 * src0, + device float4 * dst, uint tpig[[thread_position_in_grid]]) { - float x = src0[tpig]; + device const float4 & x = src0[tpig]; dst[tpig] = x / (1.0f + exp(-x)); } @@ -89,10 
+89,10 @@ constant float GELU_COEF_A = 0.044715f;
 constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
 kernel void kernel_gelu(
-    device const float * src0,
-    device float * dst,
+    device const float4 * src0,
+    device float4 * dst,
     uint tpig[[thread_position_in_grid]]) {
-    float x = src0[tpig];
+    device const float4 & x = src0[tpig];
     // BEWARE !!!
     // Simply using "tanh" instead of "precise::tanh" will sometimes result in NaNs!
@@ -107,7 +107,6 @@ kernel void kernel_soft_max(
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
-       threadgroup float * buf [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]) {
@@ -119,64 +118,70 @@ kernel void kernel_soft_max(
     device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
     // parallel max
-    buf[tpitg[0]] = -INFINITY;
-    for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        buf[tpitg[0]] = MAX(buf[tpitg[0]], psrc0[i00]);
+    float lmax = psrc0[tpitg[0]];
+    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
+        lmax = MAX(lmax, psrc0[i00]);
     }
-
-    // reduce
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg[0]/2; i > 0; i /= 2) {
-        if (tpitg[0] < i) {
-            buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]);
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
-    // the loop, and when that is done, buf[0] has the correct (synchronized) value
-    //if (tpitg[0] == 0) {
-    //    buf[0] = buf[0];
-    //}
-
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    const float max = buf[0];
+    const float max = simd_max(lmax);
     // parallel sum
-    buf[tpitg[0]] = 0.0f;
+    float lsum = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
         const float exp_psrc0 = exp(psrc0[i00] - max);
-        buf[tpitg[0]] += exp_psrc0;
+        lsum += exp_psrc0;
         // Remember the result of exp here. exp is expensive, so we really do not
         // wish to compute it twice.
pdst[i00] = exp_psrc0; } - // reduce - threadgroup_barrier(mem_flags::mem_threadgroup); - for (uint i = ntg[0]/2; i > 0; i /= 2) { - if (tpitg[0] < i) { - buf[tpitg[0]] += buf[tpitg[0] + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - } - - // broadcast - not needed, see above - //// broadcast - //if (tpitg[0] == 0) { - // buf[0] = buf[0]; - //} - - //threadgroup_barrier(mem_flags::mem_threadgroup); - - const float sum = buf[0]; + const float sum = simd_sum(lsum); for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { pdst[i00] /= sum; } } +kernel void kernel_soft_max_4( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + + // parallel max + float4 lmax4 = psrc4[tpitg[0]]; + for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) { + lmax4 = fmax(lmax4, psrc4[i00]); + } + float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); + + const float max = simd_max(lmax); + + // parallel sum + float4 lsum4 = 0.0f; + for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + const float4 exp_psrc4 = exp(psrc4[i00] - max); + lsum4 += exp_psrc4; + pdst4[i00] = exp_psrc4; + } + float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; + + const float sum = simd_sum(lsum); + + for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + pdst4[i00] /= sum; + } +} + kernel void kernel_diag_mask_inf( device const float * src0, device float * dst, @@ -192,6 +197,33 @@ kernel void kernel_diag_mask_inf( dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY; } else { dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00]; + } +} + +kernel void kernel_diag_mask_inf_8( + device const float4 * src0, + device float4 * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int & n_past, + uint3 tpig[[thread_position_in_grid]]) { + + const int64_t i = 2*tpig[0]; + + dst[i+0] = src0[i+0]; + dst[i+1] = src0[i+1]; + int64_t i4 = 4*i; + const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01; + const int64_t i01 = i4/(ne00); i4 -= i01*ne00; + const int64_t i00 = i4; + for (int k = 3; k >= 0; --k) { + if (i00 + 4 + k <= n_past + i01) { + break; + } + dst[i+1][k] = -INFINITY; + if (i00 + k > n_past + i01) { + dst[i][k] = -INFINITY; + } } } @@ -220,14 +252,10 @@ kernel void kernel_norm( } threadgroup_barrier(mem_flags::mem_threadgroup); } - //// broadcast - //if (tpitg == 0) { - // sum[0] /= ne00; - //} - //threadgroup_barrier(mem_flags::mem_threadgroup); - const float mean = sum[0]; + const float mean = sum[0] / ne00; // recenter and VARIANCE + threadgroup_barrier(mem_flags::mem_threadgroup); device float * y = dst + tgpig*ne00; sum[tpitg] = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { @@ -235,12 +263,6 @@ kernel void kernel_norm( sum[tpitg] += y[i00] * y[i00]; } - //// VARIANCE - //// parallel sum - //sum[tpitg] = 0.0f; - //for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - // sum[tpitg] += y[i00] * y[i00]; - //} // reduce threadgroup_barrier(mem_flags::mem_threadgroup); for (uint i = ntg/2; i > 0; i /= 2) { @@ -249,12 +271,7 @@ kernel void 
kernel_norm( } threadgroup_barrier(mem_flags::mem_threadgroup); } - //// broadcast - //if (tpitg == 0) { - // sum[0] /= ne00; - //} - //threadgroup_barrier(mem_flags::mem_threadgroup); - const float variance = sum[0]; + const float variance = sum[0] / ne00; const float scale = 1.0f/sqrt(variance + eps); for (int i00 = tpitg; i00 < ne00; i00 += ntg) { @@ -262,7 +279,6 @@ kernel void kernel_norm( } } - kernel void kernel_rms_norm( device const void * src0, device float * dst, @@ -630,7 +646,49 @@ kernel void kernel_mul_mat_f16_f32( } } } +} +// Assumes row size (ne00) is a multiple of 4 +kernel void kernel_mul_mat_f16_f32_l4( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int nrows = ne11; + const int64_t r0 = tgpig.x; + const int64_t im = tgpig.z; + + device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + + for (int r1 = 0; r1 < nrows; ++r1) { + device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } } kernel void kernel_alibi_f32( @@ -699,25 +757,27 @@ kernel void kernel_rope( constant int & mode, constant float & freq_base, constant float & freq_scale, - uint3 tpig[[thread_position_in_grid]]) { - const int64_t i3 = tpig[2]; - const int64_t i2 = tpig[1]; - const int64_t i1 = tpig[0]; + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int64_t i3 = tgpig[2]; + const int64_t i2 = tgpig[1]; + const int64_t i1 = tgpig[0]; const bool is_neox = mode & 2; - const float theta_scale = pow(freq_base, -2.0f/n_dims); const int64_t p = ((mode & 1) == 0 ? 
n_past + i2 : i2); - float theta = freq_scale * (float)p; + const float theta_0 = freq_scale * (float)p; + const float inv_ndims = -1.f/n_dims; if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*i0); const float cos_theta = cos(theta); const float sin_theta = sin(theta); - theta *= theta_scale; - device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -729,12 +789,12 @@ kernel void kernel_rope( } } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { + for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); const float cos_theta = cos(theta); const float sin_theta = sin(theta); - theta *= theta_scale; - const int64_t i0 = ib*n_dims + ic/2; device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -1138,31 +1198,40 @@ kernel void kernel_mul_mat_q3_K_f32( device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0; device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; - float yl[16]; + float yl[32]; - const uint16_t kmask1 = 0x0303; + const uint16_t kmask1 = 0x3030; const uint16_t kmask2 = 0x0f0f; - const int tid = tiisg/2; - const int ix = tiisg%2; - const int ip = tid/8; // 0 or 1 - const int il = tid/2 - 4*ip; // 0...3 + const int tid = tiisg/4; + const int ix = tiisg%4; + const int ip = tid/4; // 0 or 1 + const int il = 2*((tid%4)/2); // 0 or 2 const int ir = tid%2; const int n = 8; const int l0 = n*ir; - const uint16_t m1 = 1 << (4*ip + il); - const uint16_t m2 = m1 << 8; + // One would think that the Metal compiler would figure out that ip and il can only have + // 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it + // with these two tables. + // + // Possible masks for the high bit + const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200}, // ip = 0, il = 0 + {0x0004, 0x0400, 0x0008, 0x0800}, // ip = 0, il = 2 + {0x0010, 0x1000, 0x0020, 0x2000}, // ip = 1, il = 0 + {0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2 + + // Possible masks for the low 2 bits + const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}}; + + const ushort4 hm = mm[2*ip + il/2]; const int shift = 2*il; - const uint16_t qm1 = 0x0003 << shift; - const uint16_t qm2 = 0x0300 << shift; - const int32_t v1 = 4 << shift; - const int32_t v2 = 1024 << shift; + const float v1 = il == 0 ?
4.f : 64.f; + const float v2 = 4.f * v1; const uint16_t s_shift1 = 4*ip; - const uint16_t s_shift2 = s_shift1 + 2*(il/2); - const int ik = 4 + (il%2); + const uint16_t s_shift2 = s_shift1 + il; const int q_offset = 32*ip + l0; const int y_offset = 128*ip + 32*il + l0; @@ -1171,12 +1240,19 @@ kernel void kernel_mul_mat_q3_K_f32( device const float * y1 = yy + ix*QK_K + y_offset; - float sumf1[2] = {0.f}, sumf2[2] = {0.f}; - for (int i = ix; i < nb; i += 2) { + uint32_t scales32, aux32; + thread uint16_t * scales16 = (thread uint16_t *)&scales32; + thread const int8_t * scales = (thread const int8_t *)&scales32; + + float sumf1[2] = {0.f}; + float sumf2[2] = {0.f}; + for (int i = ix; i < nb; i += 4) { for (int l = 0; l < 8; ++l) { - yl[l+0] = y1[l+ 0]; - yl[l+8] = y1[l+16]; + yl[l+ 0] = y1[l+ 0]; + yl[l+ 8] = y1[l+16]; + yl[l+16] = y1[l+32]; + yl[l+24] = y1[l+48]; } device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset); @@ -1187,27 +1263,43 @@ kernel void kernel_mul_mat_q3_K_f32( for (int row = 0; row < 2; ++row) { const float d_all = (float)dh[0]; - const char2 scales = as_type((uint16_t)(((a[il] >> s_shift1) & kmask2) | (((a[ik] >> s_shift2) & kmask1) << 4))); - float s1 = 0, s2 = 0; - for (int l = 0; l < n; l += 2) { - const uint16_t qs = q[l/2]; - s1 += yl[l+0] * ((int32_t)(qs & qm1) - ((h[l/2] & m1) ? 0 : v1)); - s2 += yl[l+1] * ((int32_t)(qs & qm2) - ((h[l/2] & m2) ? 0 : v2)); - } - float d = d_all * (s1 + 1.f/256.f * s2); - sumf1[row] += d * scales[0]; - sumf2[row] += d; + scales16[0] = a[4]; + scales16[1] = a[5]; + aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030; + scales16[0] = a[il+0]; + scales16[1] = a[il+1]; + scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32; - s1 = s2 = 0; + float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0; for (int l = 0; l < n; l += 2) { - const uint16_t qs = q[l/2+8]; - s1 += yl[l+8] * ((int32_t)(qs & qm1) - ((h[l/2+8] & m1) ? 0 : v1)); - s2 += yl[l+9] * ((int32_t)(qs & qm2) - ((h[l/2+8] & m2) ? 0 : v2)); + const int32_t qs = q[l/2]; + s1 += yl[l+0] * (qs & qm[il/2][0]); + s2 += yl[l+1] * (qs & qm[il/2][1]); + s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]); + s4 += yl[l+16] * (qs & qm[il/2][2]); + s5 += yl[l+17] * (qs & qm[il/2][3]); + s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]); } - d = d_all * (s1 + 1.f/256.f * s2); - sumf1[row] += d * scales[1]; - sumf2[row] += d; + float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1); + float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2); + sumf1[row] += d1 * (scales[0] - 32); + sumf2[row] += d2 * (scales[2] - 32); + + s1 = s2 = s3 = s4 = s5 = s6 = 0; + for (int l = 0; l < n; l += 2) { + const int32_t qs = q[l/2+8]; + s1 += yl[l+8] * (qs & qm[il/2][0]); + s2 += yl[l+9] * (qs & qm[il/2][1]); + s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]); + s4 += yl[l+24] * (qs & qm[il/2][2]); + s5 += yl[l+25] * (qs & qm[il/2][3]); + s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 
0.f : yl[l+25]); + } + d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1); + d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2); + sumf1[row] += d1 * (scales[1] - 32); + sumf2[row] += d2 * (scales[3] - 32); q += step; h += step; @@ -1216,17 +1308,20 @@ kernel void kernel_mul_mat_q3_K_f32( } - y1 += 2 * QK_K; + y1 += 4 * QK_K; } for (int row = 0; row < 2; ++row) { - const float sumf = (sumf1[row] - 32.f*sumf2[row]) / (1 << shift); - const float tot = simd_sum(sumf); - if (tiisg == 0) { - dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; + const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift); + sumf1[row] = simd_sum(sumf); + } + if (tiisg == 0) { + for (int row = 0; row < 2; ++row) { + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row]; } } + } #else kernel void kernel_mul_mat_q3_K_f32( @@ -1579,17 +1674,25 @@ kernel void kernel_mul_mat_q5_K_f32( sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2); sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2); - float4 acc = {0.f, 0.f, 0.f, 0.f}; + float4 acc1 = {0.f}; + float4 acc2 = {0.f}; for (int l = 0; l < n; ++l) { uint8_t h = qh[l]; - acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0)); - acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0)); - acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0)); - acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0)); + acc1[0] += yl[l+0] * (q1[l] & 0x0F); + acc1[1] += yl[l+8] * (q1[l] & 0xF0); + acc1[2] += yh[l+0] * (q2[l] & 0x0F); + acc1[3] += yh[l+8] * (q2[l] & 0xF0); + acc2[0] += h & hm1 ? yl[l+0] : 0.f; + acc2[1] += h & hm2 ? yl[l+8] : 0.f; + acc2[2] += h & hm3 ? yh[l+0] : 0.f; + acc2[3] += h & hm4 ? yh[l+8] : 0.f; } const float dall = dh[0]; const float dmin = dh[1]; - sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) - + sumf[row] += dall * (sc8[0] * (acc1[0] + 16.f*acc2[0]) + + sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) + + sc8[4] * (acc1[2] + 16.f*acc2[2]) + + sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) - dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); q1 += step; @@ -1772,29 +1875,34 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) template void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); - const half d = il ? (xb->d / 16.h) : xb->d; - const half m = il ? ( -8.h * 16.h) : -8.h; + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float md = -8.h * xb->d; const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = il ? 0xF000 : 0x0F00; + const ushort mask1 = mask0 << 8; for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) + m) * d; - reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d; + reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; + reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; } + } template void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); - const half d = il ? (xb->d / 16.h) : xb->d; - const half m = xb->m; + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float m = xb->m; const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = il ? 
0xF000 : 0x0F00; + const ushort mask1 = mask0 << 8; for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) * d) + m; - reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) * d) + m; + reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; + reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; } } @@ -1830,7 +1938,7 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg template void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { - const float d_all = (float)(xb->d); + const half d_all = xb->d; device const uint8_t * q = (device const uint8_t *)xb->qs; device const uint8_t * h = (device const uint8_t *)xb->hmask; device const int8_t * scales = (device const int8_t *)xb->scales; @@ -1843,17 +1951,20 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg ((il/4)>0 ? 12 : 3); uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; - int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) : \ - (scale_2&kmask2) | ((scale_1&kmask1) << 4); - float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); + int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) + : (scale_2&kmask2) | ((scale_1&kmask1) << 4); + half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h); + const half ml = 4.h * dl; - il = (il/2)%4; - float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + il = (il/2) & 3; + const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl *= coef; for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i] & m) ? 0 : 4.f/coef)); + reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); } + #else float kcoef = il&1 ? 1.f/16.f : 1.f; uint16_t kmask = il&1 ? 0xF0 : 0x0F; @@ -1867,31 +1978,37 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg #endif } +static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { + return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)} + : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))}; +} + template void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { - device const uint8_t * q = xb->qs; + device const uchar * q = xb->qs; #if QK_K == 256 - const float d = (float)(xb->d); - const float min = (float)(xb->dmin); short is = (il/4) * 2; q = q + (il/4) * 32 + 16 * (il&1); - il = il%4; - const uchar4 sc = get_scale_min_k4(is, xb->scales); - const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; - const float ml = il<2 ? min * sc[1] : min * sc[3]; + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const half d = il < 2 ? xb->d : xb->d / 16.h; + const half min = xb->dmin; + const half dl = d * sc[0]; + const half ml = min * sc[1]; #else q = q + 16 * (il&1); device const uint8_t * s = xb->scales; device const half2 * dh = (device const half2 *)xb->d; const float2 d = (float2)dh[0]; const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h; - const float ml = il<2 ? d[1] * (s[0]>>4) : d[1 ]* (s[1]>>4); + const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4); #endif const ushort mask = il<2 ? 
0x0F : 0xF0; for (int i = 0; i < 16; ++i) { reg[i/4][i%4] = dl * (q[i] & mask) - ml; } + } template @@ -1900,19 +2017,19 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg device const uint8_t * qh = xb->qh; #if QK_K == 256 - const float d = (float)(xb->d); - const float min = (float)(xb->dmin); short is = (il/4) * 2; q = q + 32 * (il/4) + 16 * (il&1); qh = qh + 16 * (il&1); uint8_t ul = 1 << (il/2); - il = il%4; - const uchar4 sc = get_scale_min_k4(is, xb->scales); - const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; - const float ml = il<2 ? min * sc[1] : min * sc[3]; + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const half d = il < 2 ? xb->d : xb->d / 16.h; + const half min = xb->dmin; + const half dl = d * sc[0]; + const half ml = min * sc[1]; - const ushort mask = il<2 ? 0x0F : 0xF0; - const float qh_val = il<2 ? 16.f : 256.f; + const ushort mask = il<2 ? 0x0F : 0xF0; + const half qh_val = il<2 ? 16.h : 256.h; for (int i = 0; i < 16; ++i) { reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; } @@ -1931,7 +2048,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg template void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { - const float d_all = (float)(xb->d); + const half d_all = xb->d; device const uint8_t * ql = (device const uint8_t *)xb->ql; device const uint8_t * qh = (device const uint8_t *)xb->qh; device const int8_t * scales = (device const int8_t *)xb->scales; @@ -1939,19 +2056,21 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg #if QK_K == 256 ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); qh = qh + 32*(il/8) + 16*(il&1); - float sc = scales[(il%2) + 2 * ((il/2))]; - il = (il/2)%4; + half sc = scales[(il%2) + 2 * ((il/2))]; + il = (il/2) & 3; #else ql = ql + 16 * (il&1); - float sc = scales[il]; + half sc = scales[il]; #endif + const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; + const half coef = il>1 ? 1.f/16.h : 1.h; + const half ml = d_all * sc * 32.h; + const half dl = d_all * sc * coef; for (int i = 0; i < 16; ++i) { - uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; - const float coef = il>1 ? 1.f/16.f : 1.f; - float q = il&1 ? ((ql[i]&kmask2)|((qh[i]&kmask1)<<2)) - 32.f/coef : \ - ((ql[i]&kmask2)|((qh[i]&kmask1)<<4)) - 32.f/coef; - reg[i/4][i%4] = d_all * sc * q * coef; + const half q = il&1 ? 
((ql[i] & kmask2) | ((qh[i] & kmask1) << 2)) + : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4)); + reg[i/4][i%4] = dl * q - ml; } } diff --git a/ggml.c b/ggml.c index 1dc493783..9ed9f678d 100644 --- a/ggml.c +++ b/ggml.c @@ -1,4 +1,3 @@ -#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows #include "ggml.h" @@ -47,6 +46,10 @@ // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) #pragma warning(disable: 4244 4267) + +// disable POSIX deprecation warnings +// these functions are never going away, anyway +#pragma warning(disable: 4996) #endif #if defined(_WIN32) @@ -103,6 +106,9 @@ typedef void * thread_ret_t; #include #include +#endif +#ifdef GGML_USE_CPU_HBM +#include <hbwmalloc.h> #endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -192,9 +198,15 @@ typedef void * thread_ret_t; #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else inline static void * ggml_aligned_malloc(size_t size) { + if (size == 0) { + GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); + return NULL; + } void * aligned_memory = NULL; -#ifdef GGML_USE_METAL - int result = posix_memalign(&aligned_memory, getpagesize(), size); +#ifdef GGML_USE_CPU_HBM + int result = hbw_posix_memalign(&aligned_memory, 16, size); +#elif GGML_USE_METAL + int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); #else int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); #endif @@ -215,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) { return aligned_memory; } #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) +#ifdef GGML_USE_CPU_HBM +#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) +#else #define GGML_ALIGNED_FREE(ptr) free(ptr) #endif +#endif #define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) @@ -294,12 +310,14 @@ typedef double ggml_float; #if defined(_MSC_VER) || defined(__MINGW32__) #include <intrin.h> #else +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) #if !defined(__riscv) #include <immintrin.h> #endif #endif #endif #endif +#endif #ifdef __riscv_v_intrinsic #include <riscv_vector.h> @@ -4567,6 +4585,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { return NULL; } + // allow to call ggml_init with 0 size + if (params.mem_size == 0) { + params.mem_size = GGML_MEM_ALIGN; + } + const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN); *ctx = (struct ggml_context) { @@ -4769,7 +4792,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( size_t obj_alloc_size = 0; - if (view_src == NULL && ctx->no_alloc == false) { + if (view_src == NULL && !ctx->no_alloc) { if (ctx->scratch.data != NULL) { // allocate tensor data in the scratch buffer if (ctx->scratch.offs + data_size > ctx->scratch.size) { @@ -5470,7 +5493,7 @@ static struct ggml_tensor * ggml_mul_impl( } if (inplace) { - GGML_ASSERT(is_node == false); + GGML_ASSERT(!is_node); } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -5513,7 +5536,7 @@ static struct ggml_tensor * ggml_div_impl( } if (inplace) { - GGML_ASSERT(is_node == false); + GGML_ASSERT(!is_node); } struct ggml_tensor * result = inplace ?
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -18854,7 +18877,6 @@ static enum ggml_opt_result linesearch_backtracking( // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) return count; } - return count; } } @@ -19957,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p struct ggml_tensor * data = NULL; - if (params.no_alloc == false) { + if (!params.no_alloc) { data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size); ok = ok && data != NULL; @@ -19998,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } // point the data member to the appropriate location in the binary blob using the tensor infos - if (params.no_alloc == false) { + if (!params.no_alloc) { //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data } diff --git a/llama.cpp b/llama.cpp index 1b30b23b2..db47c5101 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,8 +1,3 @@ -// Defines fileno on msys: -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "llama.h" #include "ggml.h" @@ -127,6 +122,9 @@ void replace_all(std::string & s, const std::string & search, const std::string } s = std::move(result); } +#ifdef GGML_USE_CPU_HBM +#include <hbwmalloc.h> #endif static void zeros(std::ofstream & file, size_t n) { char zero = 0; @@ -451,6 +449,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * #elif GGML_USE_METAL # define llama_host_malloc(n) ggml_metal_host_malloc(n) # define llama_host_free(data) ggml_metal_host_free(data) +#elif GGML_USE_CPU_HBM +# define llama_host_malloc(n) hbw_malloc(n) +# define llama_host_free(data) if (data != NULL) hbw_free(data) #else # define llama_host_malloc(n) malloc(n) # define llama_host_free(data) free(data) @@ -607,16 +608,16 @@ struct llama_mmap { if (prefetch > 0) { // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) { + fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", strerror(errno)); } } if (numa) { // advise the kernel not to use readahead // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) { + fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", strerror(errno)); } } @@ -1495,7 +1496,11 @@ struct llama_model_loader { // allocate temp buffer if not using mmap if (!use_mmap && cur->data == NULL) { GGML_ASSERT(cur->backend != GGML_BACKEND_CPU); - cur->data = malloc(ggml_nbytes(cur)); + #ifdef GGML_USE_CPU_HBM + cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur)); + #else + cur->data = (uint8_t*)malloc(ggml_nbytes(cur)); + #endif } load_data_for(cur); @@ -3062,33 +3067,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) { return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL; } -static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) { - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED; -} - -static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) { - return vocab.id_to_token[id].type ==
LLAMA_TOKEN_TYPE_UNUSED; -} - static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) { return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE; } -static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) { - GGML_ASSERT(llama_is_control_token(vocab, id)); - return id == vocab.special_bos_id; -} - -static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) { - GGML_ASSERT(llama_is_control_token(vocab, id)); - return id == vocab.special_eos_id; -} - -static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) { - GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id)); - return id == vocab.special_pad_id; -} - static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(llama_is_byte_token(vocab, id)); const auto& token_data = vocab.id_to_token.at(id); @@ -4813,9 +4795,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector workers; std::mutex mutex; +#ifdef GGML_USE_K_QUANTS auto use_more_bits = [] (int i_layer, int num_layers) -> bool { return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; }; +#endif int idx = 0; @@ -5662,15 +5646,19 @@ void llama_free(struct llama_context * ctx) { } int llama_n_vocab(const struct llama_context * ctx) { - return ctx->model.vocab.id_to_token.size(); + return llama_model_n_vocab(&ctx->model); } int llama_n_ctx(const struct llama_context * ctx) { - return ctx->model.hparams.n_ctx; + return llama_model_n_ctx(&ctx->model); +} + +int llama_n_ctx_train(const struct llama_context * ctx) { + return llama_model_n_ctx_train(&ctx->model); } int llama_n_embd(const struct llama_context * ctx) { - return ctx->model.hparams.n_embd; + return llama_model_n_embd(&ctx->model); } enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) { @@ -5685,6 +5673,10 @@ int llama_model_n_ctx(const struct llama_model * model) { return model->hparams.n_ctx; } +int llama_model_n_ctx_train(const struct llama_model * model) { + return model->hparams.n_ctx_train; +} + int llama_model_n_embd(const struct llama_model * model) { return model->hparams.n_embd; } @@ -5960,7 +5952,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { rng_ss.str(std::string(&rng_buf[0], rng_size)); rng_ss >> ctx->rng; - GGML_ASSERT(rng_ss.fail() == false); + GGML_ASSERT(!rng_ss.fail()); } // set logits diff --git a/llama.h b/llama.h index 5b95aaa87..37975bebe 100644 --- a/llama.h +++ b/llama.h @@ -245,15 +245,17 @@ extern "C" { LLAMA_API bool llama_mmap_supported (void); LLAMA_API bool llama_mlock_supported(void); - LLAMA_API int llama_n_vocab(const struct llama_context * ctx); - LLAMA_API int llama_n_ctx (const struct llama_context * ctx); - LLAMA_API int llama_n_embd (const struct llama_context * ctx); + LLAMA_API int llama_n_vocab (const struct llama_context * ctx); + LLAMA_API int llama_n_ctx (const struct llama_context * ctx); + LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx); + LLAMA_API int llama_n_embd (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx); - LLAMA_API int llama_model_n_vocab(const struct llama_model * model); - LLAMA_API int llama_model_n_ctx (const struct llama_model * model); - LLAMA_API int llama_model_n_embd (const struct llama_model * model); + LLAMA_API int llama_model_n_vocab (const struct llama_model * model); + LLAMA_API int llama_model_n_ctx (const struct llama_model * model); + 
LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model); + LLAMA_API int llama_model_n_embd (const struct llama_model * model); // Get a string describing the model type LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
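
The soft_max rewrite is easier to check against a scalar reference: kernel_soft_max and kernel_soft_max_4 drop the threadgroup-memory tree reduction (and its barriers) in favor of simd_max/simd_sum across the simdgroup, but the arithmetic is still the standard numerically stable softmax, with exp(x - max) written to dst immediately so the expensive exp() runs exactly once per element. A minimal C++ sketch of what one row computes; the function and variable names are illustrative, not part of the patch:

    #include <algorithm>
    #include <cmath>

    // One row of numerically stable softmax, mirroring the Metal kernel:
    // max-reduce, exponentiate once (caching the result), then normalize.
    void soft_max_row(const float * src, float * dst, int ne00) {
        float maxv = src[0];
        for (int i = 1; i < ne00; ++i) {
            maxv = std::max(maxv, src[i]);       // the kernel's simd_max reduction
        }
        float sum = 0.0f;
        for (int i = 0; i < ne00; ++i) {
            const float e = std::exp(src[i] - maxv);
            dst[i] = e;                          // cached so exp() is not recomputed
            sum += e;                            // the kernel's simd_sum reduction
        }
        for (int i = 0; i < ne00; ++i) {
            dst[i] /= sum;                       // normalize in place
        }
    }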
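
kernel_diag_mask_inf_8 processes two float4 vectors per thread and has to recover the tensor coordinates (i02, i01, i00) of its first scalar from a flattened index before re-applying the causal condition i00 <= n_past + i01 lane by lane. The recovery is ordinary row-major index math, peeling off the largest stride first; a scalar self-check under arbitrary example sizes:

    #include <cassert>
    #include <cstdint>

    int main() {
        const int64_t ne00 = 32, ne01 = 8, ne02 = 4;    // illustrative shapes
        for (int64_t flat = 0; flat < ne00*ne01*ne02; ++flat) {
            int64_t i4 = flat;
            const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
            const int64_t i01 = i4/(ne00);      i4 -= i01*ne00;
            const int64_t i00 = i4;
            // the decomposition must invert the row-major flattening exactly
            assert(flat == i02*ne00*ne01 + i01*ne00 + i00);
        }
        return 0;
    }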
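
kernel_rope now computes each angle in closed form, theta = theta_0 * pow(freq_base, inv_ndims*i0), instead of serially accumulating theta *= theta_scale; removing the loop-carried dependence is what allows the i0 loop to be split across the threadgroup. Since theta_scale = pow(freq_base, -2/n_dims), theta_scale raised to the i0/2-th power equals pow(freq_base, inv_ndims*i0), so the two forms agree. A small self-check under assumed example values (position, dimensions, and tolerance are illustrative):

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    int main() {
        const float freq_base = 10000.0f, freq_scale = 1.0f;  // common defaults
        const int   n_dims    = 128;
        const float p         = 7.0f;                         // hypothetical position
        const float theta_0   = freq_scale * p;
        const float inv_ndims = -1.0f/n_dims;
        const float theta_scale = std::pow(freq_base, -2.0f/n_dims);

        float theta = theta_0;                                // old: serial accumulation
        for (int i0 = 0; i0 < 32; i0 += 2) {
            const float direct = theta_0 * std::pow(freq_base, inv_ndims*i0); // new: closed form
            assert(std::fabs(theta - direct) <= 1e-4f*std::fabs(direct) + 1e-6f);
            theta *= theta_scale;
        }
        std::printf("serial and closed-form RoPE angles agree\n");
        return 0;
    }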
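
In the reworked dequantize_q4_0/dequantize_q4_1, the quants are read as uint16_t pairs and the ">> 8" for the high byte is folded into a second scale, d2 = d1/256, so the inner loop becomes a pure multiply-add. The identity is exact in float arithmetic (away from subnormals) because dividing by 256 only shifts the exponent. A brute-force check over all 16-bit patterns, with an arbitrary scale value:

    #include <cassert>
    #include <cstdint>

    int main() {
        const float d1 = 0.125f;        // hypothetical block scale
        const float d2 = d1/256.0f;     // derived scale for the high byte
        for (uint32_t q = 0; q <= 0xFFFF; ++q) {
            const float a = d1 * (float)((q & 0x0F00u) >> 8); // old form: shift, then scale
            const float b = d2 * (float) (q & 0x0F00u);       // new form: shift folded into scale
            assert(a == b);
        }
        return 0;
    }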
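
The GGML_USE_CPU_HBM additions in ggml.c and llama.cpp route host buffers through memkind's hbwmalloc API (hbw_posix_memalign, hbw_malloc, hbw_free) so that weights and scratch memory can live in high-bandwidth memory on hardware that has it. A stand-alone sketch of the same compile-time switch; the helper names are made up for illustration, and building with the macro defined assumes memkind is installed (link with -lmemkind):

    #include <stdlib.h>
    #ifdef GGML_USE_CPU_HBM
    #include <hbwmalloc.h>   // memkind's high-bandwidth memory allocator
    #endif

    // Aligned allocation that comes from HBM when GGML_USE_CPU_HBM is set.
    static void * aligned_alloc_maybe_hbm(size_t size, size_t align) {
        void * ptr = NULL;
    #ifdef GGML_USE_CPU_HBM
        if (hbw_posix_memalign(&ptr, align, size) != 0) return NULL;
    #else
        if (posix_memalign(&ptr, align, size) != 0) return NULL;
    #endif
        return ptr;
    }

    // HBM allocations must be released through hbw_free, not free.
    static void free_maybe_hbm(void * ptr) {
    #ifdef GGML_USE_CPU_HBM
        if (ptr != NULL) hbw_free(ptr);
    #else
        free(ptr);
    #endif
    }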
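
The llama_mmap hunk replaces the Linux-specific madvise/MADV_* calls with the portable posix_madvise/POSIX_MADV_* spellings. One subtlety worth keeping in mind when adapting this pattern: POSIX specifies that posix_madvise returns the error number directly rather than setting errno, so a strictly portable version of the warning would pass the return value to strerror. A hypothetical helper, not taken from the patch:

    #include <cstddef>
    #include <cstdio>
    #include <cstring>
    #include <sys/mman.h>

    // Ask the kernel to prefetch a mapped region, warning on failure.
    static void advise_willneed(void * addr, size_t len) {
        const int err = posix_madvise(addr, len, POSIX_MADV_WILLNEED);
        if (err != 0) {
            // posix_madvise reports errors via its return value, not errno
            std::fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         std::strerror(err));
        }
    }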